In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency, f_oneway, kruskal

In [5]:
# Read the analysis table from 'NPAR6.csv' (path is relative to the
# notebook's working directory) into the module-level frame `df`, which
# every later cell reads and mutates.
df = pd.read_csv('NPAR6.csv')
# Show the first two rows as a quick sanity check of the columns.
df.head(2)

Unnamed: 0,subject_id,hadm_id,stay_id,intime,race,gender,age,mortality_icu,mortality_hosp,los,...,alcoholic,biliary,drug_induced,unspecified,mortality_7d,mortality_28d,mortality_90d,mortality_1y,has_kidney_disease,has_sepsis
0,19753686,29945708,30469520,2152-04-07T19:23:59,UNKNOWN,M,50,0,0,4.701343,...,0,1,1,1,0,0,0,0,0,0
1,10442603,23644640,31663173,2125-02-25T15:33:43,UNKNOWN,M,67,1,1,1.110764,...,1,0,0,1,1,1,1,1,0,1


In [None]:
# Split NPAR into four equal-frequency bins and store each row's bin label
# (Q1 = lowest values ... Q4 = highest values) in a new column.
df['NPAR_group'] = pd.qcut(
    df['NPAR'],
    4,
    labels=[f'Q{quartile}' for quartile in range(1, 5)],
)
df.head(2)

Unnamed: 0,subject_id,hadm_id,stay_id,intime,race,gender,age,mortality_icu,mortality_hosp,los,...,biliary,drug_induced,unspecified,mortality_7d,mortality_28d,mortality_90d,mortality_1y,has_kidney_disease,has_sepsis,NPAR_group
0,19753686,29945708,30469520,2152-04-07T19:23:59,UNKNOWN,M,50,0,0,4.701343,...,1,1,1,0,0,0,0,0,0,Q3
1,10442603,23644640,31663173,2125-02-25T15:33:43,UNKNOWN,M,67,1,1,1.110764,...,0,0,1,1,1,1,1,0,1,Q1


In [37]:
# Lookup table that collapses the detailed race/ethnicity labels found in the
# raw data into three analysis groups: 'White', 'Black' and 'Others'.
races = {
    'UNKNOWN': 'Others',
    'UNABLE TO OBTAIN': 'Others',
    'ASIAN - CHINESE': 'Others',
    'ASIAN - SOUTH EAST ASIAN': 'Others',
    'ASIAN - ASIAN INDIAN': 'Others',
    'ASIAN - KOREAN': 'Others',
    'ASIAN': 'Others',
    'HISPANIC OR LATINO': 'Others',
    'HISPANIC/LATINO - PUERTO RICAN': 'Others',
    'HISPANIC/LATINO - GUATEMALAN': 'Others',
    'HISPANIC/LATINO - DOMINICAN': 'Others',
    'HISPANIC/LATINO - MEXICAN': 'Others',
    'HISPANIC/LATINO - SALVADORAN': 'Others',
    'HISPANIC/LATINO - CENTRAL AMERICAN': 'Others',
    'HISPANIC/LATINO - CUBAN': 'Others',
    'HISPANIC/LATINO - COLUMBIAN': 'Others',
    'HISPANIC/LATINO - HONDURAN': 'Others',
    'NATIVE HAWAIIAN OR OTHER PACIFIC ISLANDER': 'Others',
    'AMERICAN INDIAN/ALASKA NATIVE': 'Others',
    'MULTIPLE RACE/ETHNICITY': 'Others',
    'SOUTH AMERICAN': 'Others',
    'OTHER': 'Others',
    'PATIENT DECLINED TO ANSWER': 'Others',
    'WHITE': 'White',
    'WHITE - OTHER EUROPEAN': 'White',
    'WHITE - RUSSIAN': 'White',
    'WHITE - EASTERN EUROPEAN': 'White',
    'WHITE - BRAZILIAN': 'White',
    'PORTUGUESE': 'White',
    'BLACK/AFRICAN AMERICAN': 'Black',
    'BLACK/CARIBBEAN ISLAND': 'Black',
    'BLACK/CAPE VERDEAN': 'Black',
    'BLACK/AFRICAN': 'Black',
}
# Series.map() on its own turns every label that is not a key of `races`
# into NaN. Because this cell overwrites df['race'] in place, running it a
# second time would therefore erase the already-collapsed 'White' / 'Black' /
# 'Others' values. Falling back to the current value makes the cell safe to
# re-run and keeps any label missing from the table visible instead of
# silently dropping those rows from later cross-tabulations.
df['race'] = df['race'].map(races).fillna(df['race'])

## Chi-square test

## Chi-square test
Chọn mức ý nghĩa alpha = 0.05

Giả thuyết

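H_0: Biến phân loại (ví dụ giới tính Male/Female) độc lập với nhóm NPAR, tức là phân bố của biến này giữa các nhóm Q1–Q4 là như nhau.

H_1: Biến phân loại có liên quan đến nhóm NPAR, tức là phân bố giữa các nhóm Q1–Q4 khác nhau. Bác bỏ H_0 khi p-value < alpha.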


In [None]:
# var = 'gender'
# obs = pd.crosstab(df[var], df['npar_group'])
# res = chi2_contingency(obs)
# res.pvalue


def chi_square_test(var, results, data=None):
    """Append a chi-square summary of `var` versus NPAR group to `results`.

    A contingency table of `var` (rows) against the 'NPAR_group' column
    (columns) is built and tested with `scipy.stats.chi2_contingency`.

    Rows appended to `results` (each a dict):
      * one header row: 'Variable' is the column name, every NPAR-group cell
        is an empty string and 'pvalue' holds the formatted p-value
        ('<0.001' below that threshold, otherwise three decimals);
      * one row per level of `var`: each NPAR-group cell is 'count(pct%)',
        where pct is the share of that level's rows falling in the group
        (row percentage), and 'pvalue' is an empty string.

    Parameters
    ----------
    var : str
        Name of the categorical column to cross-tabulate.
    results : list
        List that is extended in place.
    data : pandas.DataFrame, optional
        Frame holding `var` and 'NPAR_group'. Defaults to the notebook-level
        `df`, so existing two-argument calls behave as before.

    Returns
    -------
    list
        The same `results` object that was passed in.
    """
    frame = df if data is None else data
    obs = pd.crosstab(frame[var], frame['NPAR_group'])
    # Index 1 of the result is the p-value; indexing works both for the plain
    # tuple returned by older SciPy releases and for the newer result object.
    pvalue = chi2_contingency(obs)[1]

    # Take the group columns from the table itself so the header row always
    # has the same keys as the per-level rows below.
    header = {'Variable': var}
    header.update({col: '' for col in obs.columns})
    header['pvalue'] = '<0.001' if pvalue < 0.001 else f'{pvalue:.3f}'
    results.append(header)

    # Row percentages: each count divided by the total of its own level.
    percents = obs.div(obs.sum(axis=1), axis=0) * 100
    tabs = obs.astype(str) + '(' + percents.round(1).astype(str) + '%)'
    for name in obs.index:
        row = {'Variable': name}
        row.update({col: tabs.loc[name, col] for col in obs.columns})
        row['pvalue'] = ''
        results.append(row)
    return results
# Accumulator for the table rows; chi_square_test() appends to it in place.
results = []
# Categorical columns to compare across NPAR groups.
# NOTE(review): the name `vars` shadows Python's builtin vars() for the rest
# of the kernel session, and it is reassigned by later cells.
vars = ['gender', 'race', 'mortality_7d', 'mortality_28d', 'mortality_90d', 'mortality_1y', 'has_kidney_disease', 'has_sepsis']
for var in vars:
    chi_square_test(var, results)
# Render the collected rows as a single summary table.
pd.DataFrame(results)



Unnamed: 0,Variable,Q1,Q2,Q3,Q4,pvalue
0,gender,,,,,0.092
1,F,63(23.2%),59(21.8%),80(29.5%),69(25.5%),
2,M,95(26.6%),97(27.2%),77(21.6%),88(24.6%),
3,race,,,,,<0.001
4,Black,31(50.0%),10(16.1%),9(14.5%),12(19.4%),
5,Others,36(20.0%),40(22.2%),54(30.0%),50(27.8%),
6,White,91(23.6%),106(27.5%),94(24.4%),95(24.6%),
7,mortality_7d,,,,,0.004
8,0,153(26.0%),150(25.5%),148(25.1%),138(23.4%),
9,1,5(12.8%),6(15.4%),9(23.1%),19(48.7%),


# F-Test

$$H_0: \mu_{Q_1} = \mu_{Q_2} = \mu_{Q_3} = \mu_{Q_4}$$

In [None]:
def f_test(var, results, data=None):
    """Append a one-way ANOVA summary of `var` across NPAR groups to `results`.

    The values of `var` are split by the 'NPAR_group' labels Q1..Q4 (missing
    values dropped per group) and compared with `scipy.stats.f_oneway`.

    One dict is appended to `results` with the keys:
      * 'Variable' - the column name;
      * 'Total'    - 'mean ± std' over the whole `var` column of the frame
                     (not only the rows that belong to Q1..Q4);
      * 'Q1'..'Q4' - 'mean ± std' of each group;
      * 'pvalue'   - '<0.001' below that threshold, otherwise three decimals.

    Parameters
    ----------
    var : str
        Name of the numeric column to compare.
    results : list
        List that is extended in place.
    data : pandas.DataFrame, optional
        Frame holding `var` and 'NPAR_group'. Defaults to the notebook-level
        `df`, so existing two-argument calls behave as before.

    Returns
    -------
    list
        The same `results` object that was passed in.
    """
    frame = df if data is None else data
    names = ['Q1', 'Q2', 'Q3', 'Q4']
    # Select rows and column in one .loc step instead of chained indexing.
    groups = [
        frame.loc[frame['NPAR_group'] == name, var].dropna()
        for name in names
    ]
    res = f_oneway(*groups)

    dic = {
        'Variable': var,
        'Total': f'{frame[var].mean():.1f} ± {frame[var].std():.1f}',
    }
    for name, group in zip(names, groups):
        dic[name] = f'{group.mean():.1f} ± {group.std():.1f}'
    dic['pvalue'] = '<0.001' if res.pvalue < 0.001 else f'{res.pvalue:.3f}'
    results.append(dic)
    return results

# Fresh accumulator for this table; f_test() appends one row per variable.
results = []
# Numeric columns compared across NPAR groups with the F-test.
# NOTE(review): `vars` shadows Python's builtin vars() and replaces the list
# assigned under the same name in the chi-square cell.
vars = ['age', 'heart_rate_mean',
        'resp_rate_mean', 'mbp_mean', 'temperature_mean', 'spo2_mean']
for var in vars:
    f_test(var, results)
# Render the collected rows as a single summary table.
pd.DataFrame(results)





Unnamed: 0,Variable,Total,Q1,Q2,Q3,Q4,pvalue
0,age,57.6 ± 17.2,55.2 ± 16.9,59.3 ± 17.3,56.6 ± 18.2,59.4 ± 16.3,0.082
1,heart_rate_mean,96.5 ± 18.1,94.1 ± 18.1,93.1 ± 17.2,100.6 ± 17.5,98.3 ± 18.6,<0.001
2,resp_rate_mean,21.1 ± 4.3,19.8 ± 4.0,20.9 ± 4.2,22.0 ± 4.5,21.7 ± 4.4,<0.001
3,mbp_mean,81.4 ± 13.3,86.9 ± 14.4,80.2 ± 13.5,80.6 ± 11.7,77.8 ± 11.6,<0.001
4,temperature_mean,37.0 ± 0.6,36.9 ± 0.5,37.1 ± 0.7,37.1 ± 0.6,36.8 ± 0.7,<0.001
5,spo2_mean,96.3 ± 2.1,96.3 ± 2.4,96.2 ± 2.2,96.1 ± 2.0,96.4 ± 1.9,0.720


# Kruskal-Wallis test


In [46]:
def _median_iqr(values):
    """Format a numeric series as 'median (Q1 - Q3)' with one decimal each."""
    return (
        f'{values.median():.1f} '
        f'({values.quantile(0.25):.1f} - {values.quantile(0.75):.1f})'
    )


def kruskal_test(var, results, data=None):
    """Append a Kruskal-Wallis summary of `var` across NPAR groups to `results`.

    The values of `var` are split by the 'NPAR_group' labels Q1..Q4 (missing
    values dropped per group) and compared with `scipy.stats.kruskal`.

    One dict is appended to `results` with the keys:
      * 'Variable' - the column name;
      * 'Total'    - 'median (Q1 - Q3)' over the whole `var` column of the
                     frame (not only the rows that belong to Q1..Q4);
      * 'Q1'..'Q4' - 'median (Q1 - Q3)' of each group;
      * 'pvalue'   - '<0.001' below that threshold, otherwise three decimals.

    The total and the per-group cells share one format. Previously the total
    was written as 'median q1 - q3' and the groups as 'median ± q1 - q3',
    which was inconsistent and misused '±' for quartile bounds.

    Parameters
    ----------
    var : str
        Name of the numeric/ordinal column to compare.
    results : list
        List that is extended in place.
    data : pandas.DataFrame, optional
        Frame holding `var` and 'NPAR_group'. Defaults to the notebook-level
        `df`, so existing two-argument calls behave as before.

    Returns
    -------
    list
        The same `results` object that was passed in.
    """
    frame = df if data is None else data
    names = ['Q1', 'Q2', 'Q3', 'Q4']
    # Select rows and column in one .loc step instead of chained indexing.
    groups = [
        frame.loc[frame['NPAR_group'] == name, var].dropna()
        for name in names
    ]
    res = kruskal(*groups)

    dic = {'Variable': var, 'Total': _median_iqr(frame[var])}
    for name, group in zip(names, groups):
        dic[name] = _median_iqr(group)
    dic['pvalue'] = '<0.001' if res.pvalue < 0.001 else f'{res.pvalue:.3f}'
    results.append(dic)
    return results

# Fresh accumulator for this table; kruskal_test() appends one row per variable.
results = []
# Score columns compared across NPAR groups with the Kruskal-Wallis test.
# NOTE(review): `vars` shadows Python's builtin vars() and replaces the lists
# assigned under the same name in earlier cells.
vars = ['oasis', 'sofa','cci', 'apsiii']
for var in vars:
    kruskal_test(var, results)
# Render the collected rows as a single summary table.
pd.DataFrame(results)

Unnamed: 0,Variable,Total,Q1,Q2,Q3,Q4,pvalue
0,oasis,34.0 28.0 - 41.0,30.0 ± 24.0 - 36.0,33.0 ± 27.0 - 39.0,35.0 ± 29.0 - 43.0,37.0 ± 33.0 - 43.0,<0.001
1,sofa,6.5 3.0 - 10.0,5.0 ± 2.0 - 8.0,6.0 ± 3.0 - 10.0,7.0 ± 4.0 - 10.0,8.0 ± 5.0 - 11.0,<0.001
2,cci,4.0 2.0 - 6.0,3.0 ± 1.0 - 5.0,4.0 ± 2.0 - 6.0,3.0 ± 2.0 - 5.0,4.0 ± 3.0 - 6.0,0.037
3,apsiii,53.0 40.0 - 75.0,45.0 ± 34.2 - 59.8,48.0 ± 38.0 - 65.2,54.0 ± 42.0 - 69.0,73.0 ± 53.0 - 90.0,<0.001


In [None]:
# obs.sum(axis=1)

gender
F    266
M    336
dtype: int64

In [None]:
# percents = obs.div(obs.sum(axis=1), axis=0) * 100
# obs.astype(str) + '(' + percents.round(1).astype(str) + '%)'

npar_group,Q1,Q2,Q3,Q4
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,64(24.1%),53(19.9%),74(27.8%),75(28.2%)
M,87(25.9%),97(28.9%),76(22.6%),76(22.6%)


In [None]:
# percents = obs.div(obs.sum(axis=0), axis=1) * 100
# percents.round(1).astype(str) + '%'

npar_group,Q1,Q2,Q3,Q4
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,42.4%,35.3%,49.3%,49.7%
M,57.6%,64.7%,50.7%,50.3%
