## Plot ABCD demographics
To generate .csv files, run scripts/abcd_plots.py

In [1]:
import matplotlib.pyplot as plt
import sys
sys.path.append('../')
import numpy as np
import pandas as pd
import seaborn as sns
import scipy 
from scipy import stats
import ptitprince as pt
import matplotlib.patheffects as path_effects

from scripts.preprocess_utils import closest_value,find_centile,find_exact_percentile_return_number,add_median_labels

# Notebook-wide seaborn theme; later cells re-apply their own font scale.
sns.set(font_scale=2, style="whitegrid")

# Population-norm table: ages above 30 are capped at 30, and the numeric
# 'Gender' code is translated into a readable 'Sex' label.
input_annotation_file = '../data/pop_norms.csv'
df = pd.read_csv(input_annotation_file, header=0)
df['Age'] = df['Age'].mask(df['Age'] > 30, 30)
df['Sex'] = df['Gender'].map({1: 'Male', 2: 'Female'})

# Sex-specific centile charts passed to find_exact_percentile_return_number in later cells.
df_centile_boys = pd.read_csv('../data/percentiles_chart_boys.csv', header=0)
df_centile_girls = pd.read_csv('../data/percentiles_chart_girls.csv', header=0)

In [3]:
df_ant = pd.read_csv("../data/ABCD-studies/abcd_bmi.csv", header=0)

# Correlate TMT with each anthropometric column, first with Pearson's r and
# then with Spearman's rho. Each line prints the coefficient (3 decimals)
# followed by the p-value.
tmt = df_ant['TMT PRED AVG filtered']
for method_label, corr_fn in (('Pearsons', scipy.stats.pearsonr),
                              ('Spearmans', scipy.stats.spearmanr)):
    for column, shown_as in (('Weight', 'weight'), ('BMI', 'BMI'), ('Height', 'Height')):
        corr, p = corr_fn(df_ant[column], tmt)
        print(method_label + ' correlation for the TMT and ' + shown_as + ': %.3f' % corr, p)

Pearsons correlation for the TMT and weight: 0.655 0.0
Pearsons correlation for the TMT and BMI: 0.086 1.5429500976838774e-30
Pearsons correlation for the TMT and Height: 0.373 0.0
Spearmans correlation for the TMT and weight: 0.634 0.0
Spearmans correlation for the TMT and BMI: 0.630 0.0
Spearmans correlation for the TMT and Height: 0.383 0.0


## 1. Gender and geographic data

In [None]:
sns.set(font_scale=1.5, style="whitegrid")

# Two stacked panels sharing the age axis: a short panel of per-age counts
# on top of a taller panel with the split violins.
fig, axes = plt.subplots(
    nrows=2, ncols=1, figsize=(13, 6), sharex=True,
    gridspec_kw={"height_ratios": (1, 3), "hspace": 0},
)

# Coarse region label for each contributing dataset.
# NOTE(review): 'Calgary' is labelled 'USA' here -- confirm this grouping is intended.
df['Geographic data'] = df['Dataset'].map({
    'SALD': 'Asia',
    'ABCD': 'USA',
    'AOMIC': 'Europe',
    'HIMN': 'USA',
    "ICBM": 'USA/EU',
    "PING": 'USA',
    'ABIDE': 'USA/EU',
    "HAN": 'USA',
    'Calgary': 'USA',
    'BABY': 'USA',
    'IXI': 'Europe',
    'NYU': 'USA',
    'Pixar': 'USA',
})

# Top panel: number of records per age, coloured by region, on a log count axis.
sns.countplot(data=df, x="Age", hue="Geographic data",
              palette="colorblind", alpha=.7, ax=axes[0])
axes[0].set_yscale("log")

# Bottom panel: TMT distribution per age, split by sex.
sns.violinplot(
    data=df, x="Age", y="TMT PRED AVG filtered", hue="Sex",
    split=True, cut=0, palette="colorblind", ax=axes[1],
)

# Make the drawn collections semi-transparent in both panels.
for panel in (axes[1], axes[0]):
    plt.setp(panel.collections, alpha=.7)

# An empty label string replaces the automatic legends with empty ones.
axes[1].legend("")
axes[0].legend("")

## 2. Hormone Saliva Tests

In [None]:
# Table for the hormone analyses in this section; the cells below read its
# 'DHEA', 'ERT', 'gender', 'Age' and 'TMT PRED AVG filtered' columns.
df_hrm = pd.read_csv("../data/ABCD-studies/abcd_hsss01.csv", header=0)

Normal reference ranges for children (source for the limits used in the next cell): https://emedicine.medscape.com/article/2088870-overview?reg=1

In [None]:
# DHEA reference limits, keyed as {gender code: {age: {'min': ..., 'max': ...}}}.
# Gender codes follow the 'Sex' mapping used below (1 = Male, 2 = Female).
# Entries written as (a+b)/2 are the mean of two limits
# (presumably those of adjacent age brackets -- TODO confirm against the source).
dict_dhea_norms = {
    1: {
        8: {'min': 11, 'max': 120},
        9: {'min': 11, 'max': 120},
        10: {'min': 11, 'max': 120},
        11: {'min': 14, 'max': 323},
        12: {'min': 5.5, 'max': (312+323)/2},
        13: {'min': 5.5, 'max': 312},
    },
    2: {
        8: {'min': 16, 'max': 96},
        9: {'min': 16, 'max': 96},
        10: {'min': 22, 'max': 184},
        11: {'min': 11, 'max': 296},
        12: {'min': 17, 'max': 343},
        13: {'min': (17+57)/2, 'max': (343+395)/2},
    },
}

# Label every row by where its DHEA value falls relative to the reference
# limits, and look up the TMT centile on the sex-specific chart.
act_list2 = []
centile_dx_list = []
for idx in range(len(df_hrm)):
    row = df_hrm.iloc[idx]
    dhea = row['DHEA']
    gender = row['gender']
    age = row['Age']
    lower = dict_dhea_norms[gender][age]['min']
    upper = dict_dhea_norms[gender][age]['max']

    centile_chart = df_centile_girls if gender == 2 else df_centile_boys
    centile_dx_list.append(
        find_exact_percentile_return_number(
            df_hrm['TMT PRED AVG filtered'].iloc[idx], age, centile_chart))

    if dhea < lower:
        act_list2.append('<2.5%')
    elif lower <= dhea <= upper:
        # Inclusive bounds: a value exactly on a limit is within range
        # (previously it fell through to "No data" and was dropped).
        act_list2.append('2.5%-97.5%')
    elif dhea > upper:
        act_list2.append('97.5%>')
    else:
        # Only reached when the comparisons are all False, e.g. a missing (NaN) value.
        act_list2.append("No data")

# Attach all per-row columns BEFORE filtering so the list lengths always match
# the frame, then drop unlabelled rows. `.copy()` avoids writing into a slice.
# Note: the filtered frame is what later cells see as `df_hrm`.
df_hrm['DHEA Reference Range'] = act_list2
df_hrm['Centiles'] = centile_dx_list
df_hrm['Sex'] = df_hrm['gender'].map({2:'Female',1:'Male'})
df_hrm = df_hrm[df_hrm['DHEA Reference Range'] != "No data"].copy()

f, ax = plt.subplots(figsize=(4, 4))
sns.set(style="whitegrid",font_scale= 1.1)
ax = sns.violinplot(x="DHEA Reference Range", y="Centiles",
                    order=['2.5%-97.5%','<2.5%','97.5%>'],
                    data=df_hrm, dodge=False, width=0.7,
                    palette='Greys', alpha=0.3, saturation=4)
plt.setp(ax.collections, alpha=.6)
ax.set_yticks(range(0,101,10))
ax.axhline(y=50, color='gray', linestyle="--")  # 50th-centile guide line
plt.show()

print(df_hrm["DHEA Reference Range"].value_counts())

# Mann-Whitney U test of the in-range group (first entry) against every group,
# itself included; the other pairings are deliberately not tested.
unique_cols = ['2.5%-97.5%','<2.5%','97.5%>']
reference = unique_cols[0]
for j in unique_cols:
    stat, p = scipy.stats.mannwhitneyu(
        df_hrm[df_hrm['DHEA Reference Range']==reference]['Centiles'],
        df_hrm[df_hrm['DHEA Reference Range']==j]['Centiles'])
    print(reference, j, stat, p)

In [None]:
# Sources for the limits below:
#https://livhealth.com/testosterone-in-women-and-men/#:~:text=6%2D9%20years%20old%3A%20below,below%207%2D75%20ng%2Fdl
#https://www.healthcare.uiowa.edu/path_handbook/handbook/test97.html
# ERT reference limits, keyed as {gender code: {age: {'min': ..., 'max': ...}}}.
# Gender codes follow the 'Sex' mapping used below (1 = Male, 2 = Female).
# Entries written as (a+b)/2 are the mean of two limits
# (presumably those of adjacent age brackets -- TODO confirm against the sources).
dict_ert_norms = {
    1: {
        8: {'min': 7, 'max': 20},
        9: {'min': 7, 'max': (20+130)/2},
        10: {'min': 7, 'max': 130},
        11: {'min': 7, 'max': (130+800)/2},
        12: {'min': 7, 'max': 800},
        13: {'min': 7, 'max': 800},
    },
    2: {
        8: {'min': 7, 'max': 20},
        9: {'min': 7, 'max': (20+44)/2},
        10: {'min': 7, 'max': 44},
        11: {'min': 7, 'max': (44+75)/2},
        12: {'min': 7, 'max': 75},
        13: {'min': 7, 'max': 75},
    },
}

# Label every row by where its ERT value falls relative to the reference
# limits, and look up the TMT centile on the sex-specific chart.
act_list2 = []
centile_dx_list = []
for idx in range(len(df_hrm)):
    row = df_hrm.iloc[idx]
    ert = row['ERT']
    gender = row['gender']
    age = row['Age']
    lower = dict_ert_norms[gender][age]['min']
    upper = dict_ert_norms[gender][age]['max']

    centile_chart = df_centile_girls if gender == 2 else df_centile_boys
    centile_dx_list.append(
        find_exact_percentile_return_number(
            df_hrm['TMT PRED AVG filtered'].iloc[idx], age, centile_chart))

    if ert < lower:
        act_list2.append('<2.5%')
    elif lower <= ert <= upper:
        # Inclusive bounds: a value exactly on a limit is within range
        # (previously it was labelled "No data").
        act_list2.append('2.5%-97.5%')
    elif ert > upper:
        act_list2.append('97.5%>')
    else:
        # Only reached when the comparisons are all False, e.g. a missing (NaN) value.
        act_list2.append("No data")

df_hrm['ERT Levels'] = act_list2
df_hrm['Sex'] = df_hrm['gender'].map({2:'Female',1:'Male'})
df_hrm['Centiles'] = centile_dx_list

f, ax = plt.subplots(figsize=(4, 4))
sns.set(style="whitegrid",font_scale= 1.1)
# "No data" rows stay in df_hrm but are not drawn because `order` omits them.
ax = sns.violinplot(x="ERT Levels", y="Centiles",
                    order=['2.5%-97.5%','<2.5%','97.5%>'],
                    data=df_hrm, dodge=False, width=0.7,
                    palette='Greys', alpha=0.3, saturation=4)
plt.setp(ax.collections, alpha=.6)
ax.set_yticks(range(0,101,10))
ax.axhline(y=50, color='gray', linestyle="--")  # 50th-centile guide line
plt.show()

print(df_hrm["ERT Levels"].value_counts())

# Mann-Whitney U test of the in-range group (first entry) against every group,
# itself included; the other pairings are deliberately not tested.
unique_cols = ['2.5%-97.5%','<2.5%','97.5%>']
reference = unique_cols[0]
for j in unique_cols:
    stat, p = scipy.stats.mannwhitneyu(
        df_hrm[df_hrm['ERT Levels']==reference]['Centiles'],
        df_hrm[df_hrm['ERT Levels']==j]['Centiles'])
    print(reference, j, stat, p)

## 3. BMI vs TMT: abcd_ant01

In [None]:
# Table for the anthropometric analyses in this section; the cells below read its
# 'BMI', 'Height', 'Weight', 'gender', 'Age' and 'TMT PRED AVG filtered' columns.
df_ant = pd.read_csv("../data/ABCD-studies/abcd_bmi.csv", header=0)

BMI norms follow the CDC growth charts (https://www.cdc.gov/healthyweight/assessing/bmi/childrens_bmi/about_childrens_bmi.html) and the accompanying bmiagerev.xls data file.

In [None]:
# BMI limits keyed as {gender code: {age: {'min': ..., 'max': ...}}}
# (gender codes as in the 'Sex' mapping below: 1 = Male, 2 = Female).
# NOTE(review): this table is kept for reference only -- the labels below use the
# fixed cut-offs BMI_LOW_CUTOFF / BMI_HIGH_CUTOFF, not these age-specific limits.
dict_bmi_norms = {
    1: {
        8: {'min': 14.9, 'max': 17.3},
        9: {'min': 15.3, 'max': 17.9},
        10: {'min': 15.7, 'max': 18.6},
        11: {'min': 16.2, 'max': 19.3},
        12: {'min': 16.7, 'max': 20.0},
        13: {'min': 17.3, 'max': 20.8},
    },
    2: {
        8: {'min': 14.9, 'max': 17.6},
        9: {'min': 15.3, 'max': 18.3},
        10: {'min': 15.7, 'max': 19.0},
        11: {'min': 16.2, 'max': 19.8},
        12: {'min': 16.8, 'max': 20.6},
        13: {'min': 17.3, 'max': 21.3},
    },
}

# Fixed cut-offs actually used for labelling: below 14 is 'Low', 30 and above is 'High'.
BMI_LOW_CUTOFF = 14
BMI_HIGH_CUTOFF = 30

act_list2 = []
centile_dx_list = []
for idx in range(len(df_ant)):
    row = df_ant.iloc[idx]
    bmi = row['BMI']
    gender = row['gender']
    age = row['Age']

    # The unused per-row lookup in dict_bmi_norms was removed: it contributed
    # nothing to the label but raised KeyError for ages/genders outside the table.
    centile_chart = df_centile_girls if gender == 2 else df_centile_boys
    centile_dx_list.append(
        find_exact_percentile_return_number(
            df_ant['TMT PRED AVG filtered'].iloc[idx], age, centile_chart))

    if bmi < BMI_LOW_CUTOFF:
        act_list2.append('Low')
    elif BMI_LOW_CUTOFF <= bmi < BMI_HIGH_CUTOFF:
        act_list2.append('Normal')
    elif bmi >= BMI_HIGH_CUTOFF:
        act_list2.append('High')
    else:
        # A missing (NaN) BMI fails every comparison; previously it was labelled 'High'.
        act_list2.append("No data")

df_ant['BMI Levels'] = act_list2
df_ant['Sex'] = df_ant['gender'].map({2:'Female',1:'Male'})
df_ant['Centiles'] = centile_dx_list

sns.set(rc={'figure.figsize':(4,4),'axes.facecolor':'white', 'figure.facecolor':'white'})
sns.set_style("whitegrid")

f, ax = plt.subplots(figsize=(4, 4))
sns.set(style="whitegrid",font_scale= 1.1)
# Rows labelled "No data" are not drawn because `order` omits them.
ax = sns.violinplot(x="BMI Levels", y="Centiles",
                    order=['Normal','Low','High'],
                    data=df_ant, dodge=False, width=0.7,
                    palette='Greys', alpha=0.3, saturation=4)
plt.setp(ax.collections, alpha=.6)
ax.set_yticks(range(0,101,10))
ax.axhline(y=50, color='gray', linestyle="--")  # 50th-centile guide line
plt.show()

print(df_ant["BMI Levels"].value_counts())

# Mann-Whitney U test of the 'Normal' group (first entry) against every group,
# itself included; the other pairings are deliberately not tested.
unique_cols = ['Normal','Low','High']
reference = unique_cols[0]
for j in unique_cols:
    stat, p = scipy.stats.mannwhitneyu(
        df_ant[df_ant['BMI Levels']==reference]['Centiles'],
        df_ant[df_ant['BMI Levels']==j]['Centiles'])
    print(reference, j, stat, p)

In [None]:
#https://childrenswi.org/medical-care/adolescent-health-and-medicine/issues-and-concerns/adolescent-growth-and-development/normal-growth
# Height limits keyed as {gender code: {age: {'min': ..., 'max': ...}}}
# (gender codes as in the 'Sex' mapping below: 1 = Male, 2 = Female).
# NOTE(review): values look like inches -- confirm the unit of df_ant['Height'].
dict_height_norms = {
    1: {
        8: {'min': 47, 'max': 54},
        9: {'min': 47, 'max': 54},
        10: {'min': 50, 'max': 59},
        11: {'min': 50, 'max': 59},
        12: {'min': 54, 'max': 64},
        13: {'min': 54, 'max': 64},
    },
    2: {
        8: {'min': 47, 'max': 54},
        9: {'min': 47, 'max': 54},
        10: {'min': 50, 'max': 59},
        11: {'min': 50, 'max': 59},
        12: {'min': 55, 'max': 64},
        13: {'min': 55, 'max': 64},
    },
}

# Label every row by where its height falls relative to the limits for its age and gender.
act_list2 = []
for idx in range(len(df_ant)):
    row = df_ant.iloc[idx]
    height = row['Height']
    limits = dict_height_norms[row['gender']][row['Age']]
    lower, upper = limits['min'], limits['max']

    if height < lower:
        act_list2.append('Low')
    elif lower <= height <= upper:
        # Inclusive bounds: a height exactly on a limit is within range
        # (previously it was labelled "No data").
        act_list2.append('Median')
    elif height > upper:
        act_list2.append('High')
    else:
        # Only reached when the comparisons are all False, e.g. a missing (NaN) value.
        act_list2.append("No data")

df_ant['Height Levels'] = act_list2
df_ant['Sex'] = df_ant['gender'].map({2:'Female',1:'Male'})
# Map from the numeric 'gender' code: mapping the already-translated 'Sex'
# strings through integer keys produced an all-NaN column.
df_ant['Gender'] = df_ant['gender'].map({2:'Female',1:'Male'})

f, ax = plt.subplots(figsize=(4, 4))

sns.set(style="whitegrid",font_scale= 1.1)
# This cell does not compute centiles itself: the 'Centiles' column must
# already be present on df_ant.
ax = sns.violinplot(x="Height Levels", y="Centiles",
                    order=['Median','Low','High'],
                    data=df_ant, dodge=False, width=0.7,
                    palette='Greys', alpha=0.3, saturation=4)
plt.setp(ax.collections, alpha=.6)
ax.set_yticks(range(0,101,10))
ax.axhline(y=50, color='gray', linestyle="--")  # 50th-centile guide line
plt.show()
print(df_ant["Height Levels"].value_counts())

# Mann-Whitney U test for every ordered pair of groups (self-pairs included).
unique_cols = ['Median','Low','High']
for i in unique_cols:
    for j in unique_cols:
        stat, p = stats.mannwhitneyu(
            df_ant[df_ant['Height Levels']==i]['Centiles'],
            df_ant[df_ant['Height Levels']==j]['Centiles'])
        print(i, j, stat, p)

In [None]:
#https://childrenswi.org/medical-care/adolescent-health-and-medicine/issues-and-concerns/adolescent-growth-and-development/normal-growth
# Weight limits keyed as {gender code: {age: {'min': ..., 'max': ...}}}
# (gender codes as in the 'Sex' mapping below: 1 = Male, 2 = Female).
# Stored under its own name so the height table (dict_height_norms) is no longer overwritten.
# NOTE(review): values look like pounds -- confirm the unit of df_ant['Weight'].
dict_weight_norms = {
    1: {
        8: {'min': 46, 'max': 78},
        9: {'min': 46, 'max': 78},
        10: {'min': 54, 'max': 102},
        11: {'min': 54, 'max': 102},
        12: {'min': 66, 'max': 130},
        13: {'min': 66, 'max': 130},
    },
    2: {
        8: {'min': 44, 'max': 80},
        9: {'min': 44, 'max': 80},
        10: {'min': 54, 'max': 106},
        11: {'min': 54, 'max': 106},
        12: {'min': 68, 'max': 138},
        13: {'min': 68, 'max': 138},
    },
}

# Label every row by where its weight falls relative to the limits for its age
# and gender, and look up the TMT centile on the sex-specific chart.
act_list2 = []
centile_dx_list = []
for idx in range(len(df_ant)):
    row = df_ant.iloc[idx]
    weight = row['Weight']
    gender = row['gender']
    age = row['Age']
    lower = dict_weight_norms[gender][age]['min']
    upper = dict_weight_norms[gender][age]['max']

    centile_chart = df_centile_girls if gender == 2 else df_centile_boys
    centile_dx_list.append(
        find_exact_percentile_return_number(
            df_ant['TMT PRED AVG filtered'].iloc[idx], age, centile_chart))

    if weight < lower:
        act_list2.append('Low')
    elif lower <= weight <= upper:
        # Inclusive bounds: a weight exactly on a limit is within range
        # (previously it was labelled "No data").
        act_list2.append('Median')
    elif weight > upper:
        act_list2.append('High')
    else:
        # Only reached when the comparisons are all False, e.g. a missing (NaN) value.
        act_list2.append("No data")

df_ant['Weight Levels'] = act_list2
df_ant['Sex'] = df_ant['gender'].map({2:'Female',1:'Male'})
df_ant['Centiles'] = centile_dx_list
# Map from the numeric 'gender' code: mapping the already-translated 'Sex'
# strings through integer keys produced an all-NaN column.
df_ant['Gender'] = df_ant['gender'].map({2:'Female',1:'Male'})

f, ax = plt.subplots(figsize=(4, 4))

sns.set(style="whitegrid",font_scale= 1.1)
# Rows labelled "No data" are not drawn because `order` omits them.
ax = sns.violinplot(x="Weight Levels", y="Centiles",
                    order=['Median','Low','High'],
                    data=df_ant, dodge=False, width=0.7,
                    palette='Greys', alpha=0.3, saturation=4)
plt.setp(ax.collections, alpha=.6)
ax.set_yticks(range(0,101,10))
ax.axhline(y=50, color='gray', linestyle="--")  # 50th-centile guide line
plt.show()
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(df_ant["Weight Levels"].value_counts())

# Mann-Whitney U test for every ordered pair of groups (self-pairs included).
unique_cols = ['Median','Low','High']
for i in unique_cols:
    for j in unique_cols:
        stat, p = stats.mannwhitneyu(
            df_ant[df_ant['Weight Levels']==i]['Centiles'],
            df_ant[df_ant['Weight Levels']==j]['Centiles'])
        print(i, j, stat, p)

## 4. Steps activity levels abcd_fbwpas01

Boys are expected to average 12,000 to 16,000 steps/day and girls 10,000 to 13,000 steps/day (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3166269/).
According to
https://ijbnpa.biomedcentral.com/articles/10.1186/1479-5868-8-79, the norm is between 7,100 and 11,000 steps/day.


In [None]:
df_step = pd.read_csv("../data/ABCD-studies/abcd_steps.csv", header=0)
df_step['Sex'] = df_step['Gender'].map({2:'Female',1:'Male'})

# Expected daily step ranges per sex (Gender code 1 = Male, 2 = Female).
healthy_male_lower =  12000
healthy_male_higher = 16000

healthy_female_lower = 10000
healthy_female_higher = 13000

# Label every row by where its step count falls relative to the range for its
# sex, and look up the TMT centile on the sex-specific chart.
steps_list, centile_dx_list = [], []
for idx in range(len(df_step)):
    row = df_step.iloc[idx]
    steps = row['Total steps']
    gender = row['Gender']
    # Read the age of THIS row. Previously `age` was never assigned in this cell,
    # so every lookup silently reused a value left over from an earlier cell.
    # NOTE(review): assumes abcd_steps.csv has an 'Age' column like the other tables.
    age = row['Age']

    if gender == 1:
        lower, upper = healthy_male_lower, healthy_male_higher
    elif gender == 2:
        lower, upper = healthy_female_lower, healthy_female_higher
    else:
        lower = upper = None

    if lower is None:
        steps_list.append("No data")
    elif steps < lower:
        steps_list.append('Low')
    elif lower <= steps <= upper:
        # Inclusive bounds: a count exactly on a limit is within range
        # (previously it was labelled "No data").
        steps_list.append('Median')
    elif steps > upper:
        steps_list.append('High')
    else:
        # Only reached when the comparisons are all False, e.g. a missing (NaN) value.
        steps_list.append("No data")

    if gender == 2:
        centile_dx = find_exact_percentile_return_number(
            df_step['TMT PRED AVG filtered'].iloc[idx], age, df_centile_girls)
    elif gender == 1:
        centile_dx = find_exact_percentile_return_number(
            df_step['TMT PRED AVG filtered'].iloc[idx], age, df_centile_boys)
    else:
        # No chart applies; previously the previous row's centile was reused.
        centile_dx = np.nan
    centile_dx_list.append(centile_dx)

df_step['Step levels'] = steps_list
df_step['Centiles'] = centile_dx_list

f, ax = plt.subplots(figsize=(4, 4))
sns.set(style="whitegrid",font_scale= 1.1)
# Rows labelled "No data" are not drawn because `order` omits them.
ax = sns.violinplot(x="Step levels", y="Centiles", order=['Median','Low','High'],
                    data=df_step, dodge=False,
                    palette='Greys', alpha=0.3, saturation=4)
plt.setp(ax.collections, alpha=.6)
ax.set_yticks(range(0,101,10))
ax.axhline(y=50, color='gray', linestyle="--")  # 50th-centile guide line
plt.show()

# Mann-Whitney U test for every ordered pair of groups (self-pairs included).
unique_cols = ['Median','Low','High']
for i in unique_cols:
    for j in unique_cols:
        stat, p = stats.mannwhitneyu(
            df_step[df_step['Step levels']==i]['Centiles'],
            df_step[df_step['Step levels']==j]['Centiles'])
        print(i, j, stat, p)

## 5. Caloric intake (abcd_bkfs01)

According to dietary recommendations, the healthy intake for children aged 9-13 years is 1600 kcal/d for females and 1800 kcal/d for males: https://www.heart.org/en/healthy-living/healthy-eating/eat-smart/nutrition-basics/dietary-recommendations-for-healthy-children
https://www.healthychildren.org/English/healthy-living/nutrition/Pages/Energy-In-Recommended-Food-Drink-Amounts-for-Children.aspx


In [None]:
df_cals = pd.read_csv("../data/ABCD-studies/abcd_cals.csv", header=0)
df_cals['Sex'] = df_cals['Gender'].map({2:'Female',1:'Male'})

# Daily energy-intake ranges (kcal/d) per sex (Gender code 1 = Male, 2 = Female).
healthy_male_lower = 1600
healthy_male_higher = 2600

healthy_female_lower = 1400
healthy_female_higher = 2200

# Label every row by where its intake falls relative to the range for its sex,
# and look up the TMT centile on the sex-specific chart.
cals_list, centile_dx_list = [], []
for idx in range(len(df_cals)):
    row = df_cals.iloc[idx]
    kcal = row['Dt_kcal']
    gender = row['Gender']
    # Read the age of THIS row. Previously `age` was never assigned in this cell,
    # so every lookup silently reused a value left over from an earlier cell.
    # NOTE(review): assumes abcd_cals.csv has an 'Age' column like the other tables.
    age = row['Age']

    if gender == 1:
        lower, upper = healthy_male_lower, healthy_male_higher
    elif gender == 2:
        lower, upper = healthy_female_lower, healthy_female_higher
    else:
        lower = upper = None

    if lower is None:
        cals_list.append("No data")
    elif kcal < lower:
        cals_list.append('Low')
    elif lower <= kcal <= upper:
        # Inclusive bounds: an intake exactly on a limit is within range
        # (previously it was labelled "No data").
        cals_list.append('Normal')
    elif kcal > upper:
        cals_list.append('High')
    else:
        # Only reached when the comparisons are all False, e.g. a missing (NaN) value.
        cals_list.append("No data")

    if gender == 2:
        centile_dx = find_exact_percentile_return_number(
            df_cals['TMT PRED AVG filtered'].iloc[idx], age, df_centile_girls)
    elif gender == 1:
        centile_dx = find_exact_percentile_return_number(
            df_cals['TMT PRED AVG filtered'].iloc[idx], age, df_centile_boys)
    else:
        # No chart applies; previously the previous row's centile was reused.
        centile_dx = np.nan
    centile_dx_list.append(centile_dx)

df_cals['kcal/d levels'] = cals_list
df_cals['Centiles'] = centile_dx_list

f, ax = plt.subplots(figsize=(4, 4))
sns.set(style="whitegrid",font_scale= 1.1)
# Rows labelled "No data" are not drawn because `order` omits them.
ax = sns.violinplot(x="kcal/d levels", y="Centiles", order=['Normal','Low','High'],
                    data=df_cals, dodge=False,
                    palette='Greys', alpha=0.3, saturation=4)
plt.setp(ax.collections, alpha=.6)
ax.set_yticks(range(0,101,10))
ax.axhline(y=50, color='gray', linestyle="--")  # 50th-centile guide line
plt.show()

# Mann-Whitney U test of the 'Normal' group (first entry) against every group,
# itself included; the other pairings are deliberately not tested.
unique_cols = ['Normal','Low','High']
reference = unique_cols[0]
for j in unique_cols:
    stat, p = stats.mannwhitneyu(
        df_cals[df_cals['kcal/d levels']==reference]['Centiles'],
        df_cals[df_cals['kcal/d levels']==j]['Centiles'])
    print(reference, j, stat, p)

## 6. Blood Hemoglobin and Cholesterol abcd_ybd01

In healthy children under 18, HDL cholesterol should be above 45 mg/dL and non-HDL cholesterol below 120 mg/dL: https://medlineplus.gov/highcholesterolinchildrenandteens.html

In [None]:
df_blood = pd.read_csv("../data/ABCD-studies/abcd_blood.csv", header=0)

# Two-way label: 170 and above is "Above average", everything else is "Normal".
# NOTE(review): a missing value fails the >= test and so is labelled "Normal".
is_elevated = df_blood['Cholesterol'] >= 170
df_blood["Cholesterol level"] = np.where(is_elevated, "Above average", "Normal")

# TMT centile for every row, looked up on the sex-specific chart
# (Gender code 2 uses the girls' chart, anything else the boys').
centile_dx_list = []
for idx in range(len(df_blood)):
    record = df_blood.iloc[idx]
    gender = record['Gender']
    age = record['Age']
    chart = df_centile_girls if gender == 2 else df_centile_boys
    tmt_value = df_blood['TMT PRED AVG filtered'].iloc[idx]
    centile_dx_list.append(find_exact_percentile_return_number(tmt_value, age, chart))

df_blood['Centiles'] = centile_dx_list

f, ax = plt.subplots(figsize=(4, 4))

sns.set(style="whitegrid",font_scale= 1.1)
ax = sns.violinplot(
    data=df_blood,
    x="Cholesterol level",
    y="Centiles",
    order=['Normal','Above average'],
    dodge=False,
    width=0.7,
    palette='Greys',
    alpha=0.3,
    saturation=4,
)
plt.setp(ax.collections, alpha=.6)
ax.set_yticks(range(0,101,10))
ax.axhline(y=50, color='gray', linestyle="--")  # 50th-centile guide line
plt.show()
print(df_blood["Cholesterol level"].value_counts())

# Mann-Whitney U test for every ordered pair of groups (self-pairs included).
unique_cols = ['Normal','Above average']
for i in unique_cols:
    centiles_i = df_blood[df_blood['Cholesterol level']==i]['Centiles']
    for j in unique_cols:
        centiles_j = df_blood[df_blood['Cholesterol level']==j]['Centiles']
        stat, p = stats.mannwhitneyu(centiles_i, centiles_j)
        print(i, j, stat, p)

In [None]:
# Two-way label: 45 and above is "Normal", everything else is "Below average".
# NOTE(review): a missing value fails the >= test and so is labelled "Below average".
hdl_ok = df_blood['HDL Cholesterol'] >= 45
df_blood["HDL"] = np.where(hdl_ok, "Normal", "Below average")

f, ax = plt.subplots(figsize=(4, 4))

sns.set(style="whitegrid",font_scale= 1.1)
# This cell does not compute centiles itself: the 'Centiles' column must
# already be present on df_blood.
ax = sns.violinplot(
    data=df_blood,
    x="HDL",
    y="Centiles",
    order=['Normal','Below average'],
    dodge=False,
    width=0.5,
    palette='Greys',
    alpha=0.3,
    saturation=4,
)
plt.setp(ax.collections, alpha=.6)
ax.set_yticks(range(0,101,10))
ax.axhline(y=50, color='gray', linestyle="--")  # 50th-centile guide line
plt.show()
print(df_blood["HDL"].value_counts())

# Mann-Whitney U test for every ordered pair of groups (self-pairs included).
unique_cols = ['Normal','Below average']
for i in unique_cols:
    centiles_i = df_blood[df_blood['HDL']==i]['Centiles']
    for j in unique_cols:
        centiles_j = df_blood[df_blood['HDL']==j]['Centiles']
        stat, p = stats.mannwhitneyu(centiles_i, centiles_j)
        print(i, j, stat, p)