In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
import matplotlib.pyplot as plt

from pandas.plotting import table 
from functools import reduce

-----------------------------------------

In [2]:
# import depressive datasets
df_depressive = pd.read_spss(r"../data/Data_Emma_Raitoharju_BDI.sav")
df_depressive.rename(columns={df_depressive.columns[0]: 'patientID'}, inplace=True)

df_depressive['patientID'] = df_depressive['patientID'].astype(int)
df_depressive.head()

Unnamed: 0,patientID,b1,b2,b3,b4,b5,b6,b7,b8,b9,...,b12_12,b13_12,b14_12,b15_12,b16_12,b17_12,b18_12,b19_12,b20_12,b21_12
0,1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,11.0,0.0,11.0,0.0,1.0,0.0
1,2,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,4,,,,,,,,,,...,0.0,0.0,0.0,0.0,12.0,1.0,0.0,0.0,1.0,0.0
4,5,,,,,,,,,,...,,,,,,,,,,


In [3]:
# import cvd dataset and append the sheets to 1 dataframe
cvd_path = r"../data/yfs_phenodata_tycho.xlsx"
cvd_excel = pd.ExcelFile(cvd_path)
cvd_sheets = []

for sheet in cvd_excel.sheet_names:
    sheet = pd.read_excel(cvd_path, sheet_name=sheet)
    cvd_sheets.append(sheet)

for dataframe in cvd_sheets:
    dataframe.rename(columns={dataframe.columns[0]: 'patientID'}, inplace=True)

df_cvd = reduce(lambda  left,right: pd.merge(left,right,on=['patientID'],
                                            how='outer'), cvd_sheets)
df_cvd.head()

Unnamed: 0,patientID,ika07,SP_x,bmi07,smoke07,apoa107,Ace07,syst07,dkv07,totkol07,...,Ace12,syst12,dkv12,totkol12,ldlkol12,hdlkol12,trigly12,insu12,gluk12,crp12
0,1,39,1,36.934441,0.0,1.32,0.04281,129.333333,78.666667,4.7,...,0.04412,151.333333,86.333333,7.4,5.3,0.88,2.7,16.0,5.2,7.65
1,2,39,1,22.481329,0.0,1.73,0.04924,124.0,76.666667,5.2,...,0.045,125.333333,87.333333,5.4,3.3,1.56,1.2,6.0,5.9,0.26
2,3,39,1,23.355637,0.0,1.71,0.0473,108.0,73.333333,4.4,...,0.07207,109.333333,68.666667,5.0,3.0,1.55,1.0,5.0,4.5,1.16
3,4,39,1,,,,,,,,...,,,,,,,,,,
4,5,39,1,27.1809,0.0,1.61,0.04298,122.666667,89.333333,3.7,...,0.04604,112.0,67.333333,3.8,2.1,1.46,0.5,4.0,4.7,0.56


--------------------------------

In [4]:
# Recode the values
def recode_values(value):  
    if value == 0:
        return 0
    elif value in [2, 3, 11, 12]:
        return 1
    elif value in [4, 5, 21, 22]:
        return 2
    elif value in [6, 7, 31, 32]:
        return 3
    else:
        return value

columns_to_recode = ['b16', 'b18', 'b16_12', 'b18_12', 'deprb1611', 'deprb1811']
for col in columns_to_recode:
    df_depressive[col] = df_depressive[col].apply(recode_values)

In [5]:
columns = ['deprb0111', 'deprb0211', 'deprb0311', 'deprb0411', 'deprb0511', 'deprb0611', 'deprb0711',
           'deprb0811', 'deprb0911', 'deprb1011', 'deprb1111', 'deprb1211', 'deprb1311', 'deprb1411', 
           'deprb1511', 'deprb1711', 'deprb1911', 'deprb2011', 'deprb2111']
df_depressive[columns] = df_depressive[columns].apply(lambda x: x - 1)

df_depressive.head()

Unnamed: 0,patientID,b1,b2,b3,b4,b5,b6,b7,b8,b9,...,b12_12,b13_12,b14_12,b15_12,b16_12,b17_12,b18_12,b19_12,b20_12,b21_12
0,1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
1,2,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
3,4,,,,,,,,,,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0
4,5,,,,,,,,,,...,,,,,,,,,,


----------------

In [6]:
# Merge dataframes
df_merged = pd.merge(df_depressive, df_cvd, on='patientID')
df_merged.head()

Unnamed: 0,patientID,b1,b2,b3,b4,b5,b6,b7,b8,b9,...,Ace12,syst12,dkv12,totkol12,ldlkol12,hdlkol12,trigly12,insu12,gluk12,crp12
0,1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.04412,151.333333,86.333333,7.4,5.3,0.88,2.7,16.0,5.2,7.65
1,2,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.045,125.333333,87.333333,5.4,3.3,1.56,1.2,6.0,5.9,0.26
2,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.07207,109.333333,68.666667,5.0,3.0,1.55,1.0,5.0,4.5,1.16
3,4,,,,,,,,,,...,,,,,,,,,,
4,5,,,,,,,,,,...,0.04604,112.0,67.333333,3.8,2.1,1.46,0.5,4.0,4.7,0.56


In [7]:
new_column_names = {
    'b1':'sadness_2007_D',
    'deprb0111':'sadness_2011_D',
    'b1_12':'sadness_2012_D',
    'b2':'pessimism_2007_D',
    'deprb0211':'pessimism_2011_D',
    'b2_12':'pessimism_2012_D',
    'b3':'pastFailure_2007_D',
    'deprb0311':'pastFailure_2011_D',
    'b3_12':'pastFailure_2012_D',
    'b4':'lossOfPleasure_2007_D',
    'deprb0411':'lossOfPleasure_2011_D',
    'b4_12':'lossOfPleasure_2012_D',
    'b5':'guiltyFeelings_2007_D',
    'deprb0511':'guiltyFeelings_2011_D',
    'b5_12':'guiltyFeelings_2012_D',
    'b6':'punishmentFeelings_2007_D',
    'deprb0611':'punishmentFeelings_2011_D',
    'b6_12':'punishmentFeelings_2012_D',
    'b7':'selfDislike_2007_D',
    'deprb0711':'selfDislike_2011_D',
    'b7_12':'selfDislike_2012_D',
    'b8':'selfCriticalness_2007_D',
    'deprb0811':'selfCriticalness_2011_D',
    'b8_12':'selfCriticalness_2012_D',
    'b9':'suicidalThoughtOrWishes_2007_D',
    'deprb0911':'suicidalThoughtOrWishes_2011_D',
    'b9_12':'suicidalThoughtOrWishes_2012_D',
    'b10':'crying_2007_D',
    'deprb1011':'crying_2011_D',
    'b10_12':'crying_2012_D',
    'b11':'agitation_2007_D',
    'deprb1111':'agitation_2011_D',
    'b11_12':'agitation_2012_D',
    'b12':'lossOfInterest_2007_D',
    'deprb1211':'lossOfInterest_2011_D',
    'b12_12':'lossOfInterest_2012_D',
    'b13':'indecisiveness_2007_D',
    'deprb1311':'indecisiveness_2011_D',
    'b13_12':'indecisiveness_2012_D',
    'b14':'worthlessness_2007_D',
    'deprb1411':'worthlessness_2011_D',
    'b14_12':'worthlessness_2012_D',
    'b15':'lossOfEnergy_2007_D',
    'deprb1511':'lossOfEnergy_2011_D',
    'b15_12':'lossOfEnergy_2012_D',
    'b16':'changesInSleepPattern_2007_D',
    'deprb1611':'changesInSleepPattern_2011_D',
    'b16_12':'changesInSleepPattern_2012_D',
    'b17':'irritability_2007_D',
    'deprb1711':'irritability_2011_D',
    'b17_12':'irritability_2012_D',
    'b18':'changesInAppetite_2007_D',
    'deprb1811':'changesInAppetite_2011_D',
    'b18_12':'changesInAppetite_2012_D',
    'b19':'concentrationDifficulty_2007_D',
    'deprb1911':'concentrationDifficulty_2011_D',
    'b19_12':'concentrationDifficulty_2012_D',
    'b20':'tirednessOrFatigue_2007_D',
    'deprb2011':'tirednessOrFatigue_2011_D',
    'b20_12':'tirednessOrFatigue_2012_D',
    'b21':'lossOfInterestInSex_2007_D',
    'deprb2111':'lossOfInterestInSex_2011_D',
    'b21_12':'lossOfInterestInSex_2012_D',

    "ika07": "age_2007_CO",
    "ika11": "age_2011_CO",
    "ika12": "age_2012_CO",
    "smoke07": "smoking_2007_CO",
    "smoke11": "smoking_2011_CO",
    "smoke12": "smoking_2012_CO",
    "SP": "sex_2007_CO",
    "SP_x": "sex_2011_CO",
    "SP_y": "sex_2012_CO",
    "bmi07": "bmi_2007_CO",
    "BMI11": "bmi_2011_CO",
    "BMI12": "bmi_2012_CO",

    "Ace07": "acetate_2007_CVD",
    "Ace11": "acetate_2011_CVD",
    "Ace12": "acetate_2012_CVD",
    "apoa107": "apoprotein_2007_CVD",
    "APOA111": "apoprotein_2011_CVD",
    "apoa112": "apoprotein_2012_CVD",
    "crp07": "c-reactiveProtein_2007_CVD",
    "CRP11": "c-reactiveProtein_2011_CVD",
    "crp12": "c-reactiveProtein_2012_CVD",
    "dkv07": "diastolicKV_2007_CVD",
    "DKV11": "diastolicKV_2011_CVD",
    "dkv12": "diastolicKV_2012_CVD",
    "gluk07": "gluk_2007_CVD",
    "GLUK11": "gluk_2011_CVD",
    "gluk12": "gluk_2012_CVD",
    "HDLKOL11": "cholesterolHDL_2007_CVD",
    "hdlkol07": "cholesterolHDL_2011_CVD",
    "hdlkol12": "cholesterolHDL_2012_CVD",
    "insu07": "insu_2007_CVD",
    "INSU11": "insu_2011_CVD",
    "insu12": "insu_2012_CVD",
    "ldlkol07": "cholesterolLDL_2007_CVD",
    "ldlkol11": "cholesterolLDL_2011_CVD",
    "ldlkol12": "cholesterolLDL_2012_CVD",
    "syst07": "systolicBloodPressure_2007_CVD",
    "SYST11": "systolicBloodPressure_2011_CVD",
    "syst12": "systolicBloodPressure_2012_CVD",
    "totkol07": "cholesterolTotal_2007_CVD",
    "TOTKOL11": "cholesterolTotal_2011_CVD",
    "totkol12": "cholesterolTotal_2012_CVD",
    "trigly07": "triglycerides_2007_CVD",
    "TRIGLY11": "triglycerides_2011_CVD",
    "trigly12": "triglycerides_2012_CVD",
}

In [8]:
# df_depressive = df_depressive.rename(columns=new_depressive_column_names)
# df_cvd = df_cvd.rename(columns=new_cvd_column_names)
df_merged = df_merged.rename(columns=new_column_names)

df_merged.head()

Unnamed: 0,patientID,sadness_2007_D,pessimism_2007_D,pastFailure_2007_D,lossOfPleasure_2007_D,guiltyFeelings_2007_D,punishmentFeelings_2007_D,selfDislike_2007_D,selfCriticalness_2007_D,suicidalThoughtOrWishes_2007_D,...,acetate_2012_CVD,systolicBloodPressure_2012_CVD,diastolicKV_2012_CVD,cholesterolTotal_2012_CVD,cholesterolLDL_2012_CVD,cholesterolHDL_2012_CVD,triglycerides_2012_CVD,insu_2012_CVD,gluk_2012_CVD,c-reactiveProtein_2012_CVD
0,1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.04412,151.333333,86.333333,7.4,5.3,0.88,2.7,16.0,5.2,7.65
1,2,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.045,125.333333,87.333333,5.4,3.3,1.56,1.2,6.0,5.9,0.26
2,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.07207,109.333333,68.666667,5.0,3.0,1.55,1.0,5.0,4.5,1.16
3,4,,,,,,,,,,...,,,,,,,,,,
4,5,,,,,,,,,,...,0.04604,112.0,67.333333,3.8,2.1,1.46,0.5,4.0,4.7,0.56


---------------------------

In [9]:
df_merged.isna().sum()

patientID                        0
sadness_2007_D                1544
pessimism_2007_D              1545
pastFailure_2007_D            1546
lossOfPleasure_2007_D         1546
                              ... 
cholesterolHDL_2012_CVD       1315
triglycerides_2012_CVD        1313
insu_2012_CVD                 1314
gluk_2012_CVD                 1313
c-reactiveProtein_2012_CVD    1313
Length: 109, dtype: int64

In [10]:
df_merged = df_merged.interpolate()
df_merged.isna().sum()

patientID                     0
sadness_2007_D                0
pessimism_2007_D              0
pastFailure_2007_D            0
lossOfPleasure_2007_D         0
                             ..
cholesterolHDL_2012_CVD       0
triglycerides_2012_CVD        0
insu_2012_CVD                 0
gluk_2012_CVD                 0
c-reactiveProtein_2012_CVD    0
Length: 109, dtype: int64

In [11]:
df_merged.head()

Unnamed: 0,patientID,sadness_2007_D,pessimism_2007_D,pastFailure_2007_D,lossOfPleasure_2007_D,guiltyFeelings_2007_D,punishmentFeelings_2007_D,selfDislike_2007_D,selfCriticalness_2007_D,suicidalThoughtOrWishes_2007_D,...,acetate_2012_CVD,systolicBloodPressure_2012_CVD,diastolicKV_2012_CVD,cholesterolTotal_2012_CVD,cholesterolLDL_2012_CVD,cholesterolHDL_2012_CVD,triglycerides_2012_CVD,insu_2012_CVD,gluk_2012_CVD,c-reactiveProtein_2012_CVD
0,1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.04412,151.333333,86.333333,7.4,5.3,0.88,2.7,16.0,5.2,7.65
1,2,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.045,125.333333,87.333333,5.4,3.3,1.56,1.2,6.0,5.9,0.26
2,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.07207,109.333333,68.666667,5.0,3.0,1.55,1.0,5.0,4.5,1.16
3,4,0.0,0.0,0.0,0.0,0.75,0.0,0.0,0.0,0.0,...,0.059055,110.666667,68.0,4.4,2.55,1.505,0.75,4.5,4.6,0.86
4,5,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,...,0.04604,112.0,67.333333,3.8,2.1,1.46,0.5,4.0,4.7,0.56


In [12]:
# Export df_merged for the descriptives
df_merged.to_csv('../data/descriptive_dataset.csv', index=False)

In [29]:
df_merged.iloc[:, 1:].max()

sadness_2007_D                 3.00
pessimism_2007_D               3.00
pastFailure_2007_D             3.00
lossOfPleasure_2007_D          3.00
guiltyFeelings_2007_D          3.00
                              ...  
cholesterolHDL_2012_CVD        2.82
triglycerides_2012_CVD        12.90
insu_2012_CVD                 89.00
gluk_2012_CVD                 20.30
c-reactiveProtein_2012_CVD    64.90
Length: 108, dtype: float64

-------------------------------

In [13]:
def calculate_residu(data, year):
    print(f"year: {year}")
    
    covariance_var = [col for col in data if col.endswith("CO")] + ["patiendID"]
    print(f"len covariance list: {len(covariance_var)}")
    outcome_vars =  list(set(data.columns) - set(covariance_var))

    print(f"len outcome list: {len(outcome_vars)}\n")
    X = data[[f'sex_{year}_CO', f'age_{year}_CO']]
    X = sm.add_constant(X)
    
    outcome_list = []
    for outcome in outcome_vars:
        y = data[outcome].values  # Convert outcome variable to numpy array
        X_values = X.values  # Convert predictor variables to numpy array
        model = sm.OLS(y, X_values)  # Use OLS directly with numpy arrays
        
        results = model.fit()
        residuals = results.resid
        outcome_list.append(pd.Series(residuals, name=outcome))
        
    outcome_list = pd.concat(outcome_list, axis=1)
    return outcome_list

In [14]:
years = list({re.search(r'_(\d{4})_', col).group(1) for col in df_merged.columns 
              if re.search(r'_(\d{4})_', col)})
years = sorted(years, key=lambda x: int(x))
print(f"years in df: {years}\n")

df_residu_list = [df_merged["patientID"]]

for year in years:
    columns_with_year = [col for col in df_merged.columns if re.search(fr'_{year}_', col)]
    data = df_merged[columns_with_year]

    df_residu_list.append(calculate_residu(data, year))
    
df_residu = pd.concat(df_residu_list, axis=1)
df_residu.head()

years in df: ['2007', '2011', '2012']

year: 2007
len covariance list: 5
len outcome list: 32

year: 2011
len covariance list: 5
len outcome list: 32

year: 2012
len covariance list: 5
len outcome list: 32



Unnamed: 0,patientID,c-reactiveProtein_2007_CVD,worthlessness_2007_D,changesInAppetite_2007_D,crying_2007_D,suicidalThoughtOrWishes_2007_D,cholesterolHDL_2007_CVD,lossOfEnergy_2007_D,insu_2007_CVD,selfDislike_2007_D,...,acetate_2012_CVD,crying_2012_D,sadness_2012_D,changesInSleepPattern_2012_D,irritability_2012_D,triglycerides_2012_CVD,lossOfEnergy_2012_D,lossOfInterestInSex_2012_D,cholesterolLDL_2012_CVD,cholesterolTotal_2012_CVD
0,1,3.622174,-0.185202,0.536953,-0.196018,-0.068178,-0.09154,0.569385,12.589573,-0.198712,...,-0.000615,-0.168389,-0.157675,0.343788,-0.283528,1.500118,0.612067,-0.467787,2.098647,2.276046
1,2,-1.617826,-0.185202,-0.463047,-0.196018,-0.068178,-0.06154,-0.430615,-0.720427,-0.198712,...,0.000265,-0.168389,-0.157675,0.343788,0.049806,0.000118,0.278734,-0.467787,0.098647,0.276046
2,3,-1.007826,-0.185202,-0.463047,-0.196018,-0.068178,0.24846,-0.430615,-5.020427,-0.198712,...,0.027335,-0.168389,-0.157675,0.343788,0.383139,-0.199882,-0.054599,-0.467787,-0.201353,-0.123954
3,4,-0.487826,-0.185202,-0.463047,-0.196018,-0.068178,0.21346,-0.430615,-3.230427,-0.198712,...,0.01432,-0.168389,-0.157675,0.343788,0.716472,-0.449882,-0.387933,-0.467787,-0.651353,-0.723954
4,5,0.032174,-0.185202,-0.463047,-0.196018,-0.068178,0.17846,-0.430615,-1.440427,-0.198712,...,0.001305,-0.168389,-0.157675,0.010454,0.383139,-0.699882,-0.387933,-0.467787,-1.101353,-1.323954


-----------------

In [15]:
sort_columns = {
    'deprb0111':'sadness_2011_D',
    'b1':'sadness_2007_D',
    'b1_12':'sadness_2012_D',
    'b2':'pessimism_2007_D',
    'deprb0211':'pessimism_2011_D',
    'b2_12':'pessimism_2012_D',
    'b3':'pastFailure_2007_D',
    'deprb0311':'pastFailure_2011_D',
    'b3_12':'pastFailure_2012_D',
    'b4':'lossOfPleasure_2007_D',
    'deprb0411':'lossOfPleasure_2011_D',
    'b4_12':'lossOfPleasure_2012_D',
    'b5':'guiltyFeelings_2007_D',
    'deprb0511':'guiltyFeelings_2011_D',
    'b5_12':'guiltyFeelings_2012_D',
    'b6':'punishmentFeelings_2007_D',
    'deprb0611':'punishmentFeelings_2011_D',
    'b6_12':'punishmentFeelings_2012_D',
    'b7':'selfDislike_2007_D',
    'deprb0711':'selfDislike_2011_D',
    'b7_12':'selfDislike_2012_D',
    'b8':'selfCriticalness_2007_D',
    'deprb0811':'selfCriticalness_2011_D',
    'b8_12':'selfCriticalness_2012_D',
    'b9':'suicidalThoughtOrWishes_2007_D',
    'deprb0911':'suicidalThoughtOrWishes_2011_D',
    'b9_12':'suicidalThoughtOrWishes_2012_D',
    'b10':'crying_2007_D',
    'deprb1011':'crying_2011_D',
    'b10_12':'crying_2012_D',
    'b11':'agitation_2007_D',
    'deprb1111':'agitation_2011_D',
    'b11_12':'agitation_2012_D',
    'b12':'lossOfInterest_2007_D',
    'deprb1211':'lossOfInterest_2011_D',
    'b12_12':'lossOfInterest_2012_D',
    'b13':'indecisiveness_2007_D',
    'deprb1311':'indecisiveness_2011_D',
    'b13_12':'indecisiveness_2012_D',
    'b14':'worthlessness_2007_D',
    'deprb1411':'worthlessness_2011_D',
    'b14_12':'worthlessness_2012_D',
    'b15':'lossOfEnergy_2007_D',
    'deprb1511':'lossOfEnergy_2011_D',
    'b15_12':'lossOfEnergy_2012_D',
    'b16':'changesInSleepPattern_2007_D',
    'deprb1611':'changesInSleepPattern_2011_D',
    'b16_12':'changesInSleepPattern_2012_D',
    'b17':'irritability_2007_D',
    'deprb1711':'irritability_2011_D',
    'b17_12':'irritability_2012_D',
    'b18':'changesInAppetite_2007_D',
    'deprb1811':'changesInAppetite_2011_D',
    'b18_12':'changesInAppetite_2012_D',
    'b19':'concentrationDifficulty_2007_D',
    'deprb1911':'concentrationDifficulty_2011_D',
    'b19_12':'concentrationDifficulty_2012_D',
    'b20':'tirednessOrFatigue_2007_D',
    'deprb2011':'tirednessOrFatigue_2011_D',
    'b20_12':'tirednessOrFatigue_2012_D',
    'b21':'lossOfInterestInSex_2007_D',
    'deprb2111':'lossOfInterestInSex_2011_D',
    'b21_12':'lossOfInterestInSex_2012_D',

    "Ace07": "acetate_2007_CVD",
    "Ace11": "acetate_2011_CVD",
    "Ace12": "acetate_2012_CVD",
    "apoa107": "apoprotein_2007_CVD",
    "APOA111": "apoprotein_2011_CVD",
    "apoa112": "apoprotein_2012_CVD",    
    "crp07": "c-reactiveProtein_2007_CVD",
    "CRP11": "c-reactiveProtein_2011_CVD",
    "crp12": "c-reactiveProtein_2012_CVD",
    "dkv07": "diastolicKV_2007_CVD",
    "DKV11": "diastolicKV_2011_CVD",
    "dkv12": "diastolicKV_2012_CVD",    
    "gluk07": "gluk_2007_CVD",
    "GLUK11": "gluk_2011_CVD",
    "gluk12": "gluk_2012_CVD",
    "HDLKOL11": "cholesterolHDL_2007_CVD",
    "hdlkol07": "cholesterolHDL_2011_CVD",
    "hdlkol12": "cholesterolHDL_2012_CVD",    
    "insu07": "insu_2007_CVD",
    "INSU11": "insu_2011_CVD",
    "insu12": "insu_2012_CVD",
    "ldlkol07": "cholesterolLDL_2007_CVD",
    "ldlkol11": "cholesterolLDL_2011_CVD",
    "ldlkol12": "cholesterolLDL_2012_CVD",    
    "syst07": "systolicBloodPressure_2007_CVD",
    "SYST11": "systolicBloodPressure_2011_CVD",
    "syst12": "systolicBloodPressure_2012_CVD",
    "totkol07": "cholesterolTotal_2007_CVD",
    "TOTKOL11": "cholesterolTotal_2011_CVD",
    "totkol12": "cholesterolTotal_2012_CVD",
    "trigly07": "triglycerides_2007_CVD",
    "TRIGLY11": "triglycerides_2011_CVD",
    "trigly12": "triglycerides_2012_CVD",
}

In [16]:
sort_list = ["patientID"] + list(sort_columns.values())
df_residu = df_residu[sort_list]

In [17]:
df_residu.to_csv('../data/dataset.csv', index=False)

In [27]:
df_residu.iloc[:, 1:].min().min()

-40.037513766778645