# Demo Chi-square test

In [10]:
import pandas as pd
import numpy as np
from scipy import stats

### test demo 0

In [19]:
data =  {
    "ED" : [10, 45, 5],
    "IPD": [30, 4, 20]
}

df = pd.DataFrame(data)
print(df) 
df_arr = df.to_numpy()
print(df_arr.transpose())
print(type(df_arr))

   ED  IPD
0  10   30
1  45    4
2   5   20
[[10 45  5]
 [30  4 20]]
<class 'numpy.ndarray'>


In [20]:
def demo_transform(dataframe: pd.DataFrame) -> np.ndarray: 
    return dataframe.to_numpy().transpose()

In [17]:
def evaluate_pvalue(pvalue: float, alpha=0.05) -> str:
    return "Dependent (reject H0)" if pvalue <= alpha else "Independent (H0 holds true)"

In [18]:
data = df_arr.transpose()
results = stats.chi2_contingency(data)
print(results)
pvalue_result = results.pvalue
print("pvalue:",pvalue_result)
print(evaluate_pvalue(pvalue_result))

Chi2ContingencyResult(statistic=np.float64(53.137528344671196), pvalue=np.float64(2.8928927226028307e-12), dof=2, expected_freq=array([[21.05263158, 25.78947368, 13.15789474],
       [18.94736842, 23.21052632, 11.84210526]]))
pvalue: 2.8928927226028307e-12
Dependent (reject H0)


### test demo 1

In [22]:
data =  {
    "ED" : [20, 4, 5],
    "IPD": [20, 4, 10]
}
df = pd.DataFrame(data)
data_arr = demo_transform(df)

results = stats.chi2_contingency(data_arr)
print(results)
pvalue_result = results.pvalue
print("pvalue:",pvalue_result)
print(evaluate_pvalue(pvalue_result))

Chi2ContingencyResult(statistic=np.float64(1.277890466531441), pvalue=np.float64(0.5278488879693956), dof=2, expected_freq=array([[18.41269841,  3.68253968,  6.9047619 ],
       [21.58730159,  4.31746032,  8.0952381 ]]))
pvalue: 0.5278488879693956
Independent (H0 holds true)


### test demo 2

In [23]:
data =  {
    "ED" : [20, 10, 5],
    "IPD": [20, 10, 100]
}
df = pd.DataFrame(data)
data_arr = demo_transform(df)

results = stats.chi2_contingency(data_arr)
print(results)
pvalue_result = results.pvalue
print("pvalue:",pvalue_result)
print(evaluate_pvalue(pvalue_result))

Chi2ContingencyResult(statistic=np.float64(46.754317111459976), pvalue=np.float64(7.037672236043776e-11), dof=2, expected_freq=array([[ 8.48484848,  4.24242424, 22.27272727],
       [31.51515152, 15.75757576, 82.72727273]]))
pvalue: 7.037672236043776e-11
Dependent (reject H0)


# demo with real data sample

In [69]:
# import
diagnosis_df = pd.read_csv("./data/diagnosis.csv") #ED
diagnosis_icd_df = pd.read_csv("./data/diagnoses_icd.csv") #IPD

In [70]:
# 100 head sample
ED100_df = diagnosis_df.loc[:99,["icd_code"]]
IPD100_df = diagnosis_icd_df.loc[:99,["icd_code"]]

display(ED100_df)
display(IPD100_df)

Unnamed: 0,icd_code
0,486
1,4254
2,5609
3,49392
4,7842
...,...
95,5770
96,5750
97,30500
98,29630


Unnamed: 0,icd_code
0,5723
1,78959
2,5715
3,07070
4,496
...,...
95,41071
96,5849
97,2875
98,7802


In [71]:
print(IPD100_df["icd_code"].value_counts().to_dict())

{'78959': 4, '5715': 4, '496': 4, 'V08': 3, '2761': 3, '3051': 3, 'V1582': 2, '29680': 2, 'F419': 2, 'I341': 2, 'E785': 2, '2875': 2, '2767': 2, 'V462': 2, 'F0280': 2, 'Z8546': 2, 'Z87891': 2, 'M810': 2, 'K219': 2, '5723': 1, '30981': 1, '07070': 1, '07054': 1, '78791': 1, 'G3183': 1, '30500': 1, 'V4986': 1, '7994': 1, '45829': 1, 'R4182': 1, 'R296': 1, 'R441': 1, '5283': 1, 'R609': 1, 'G20': 1, '52109': 1, 'K31819': 1, 'R0989': 1, '07044': 1, '07071': 1, 'K449': 1, 'R1310': 1, 'S72012A': 1, 'W010XXA': 1, 'Y93K1': 1, 'Y92480': 1, 'E7800': 1, 'G43909': 1, 'Z87442': 1, 'Z7901': 1, '9222': 1, '920': 1, 'E8854': 1, 'E8495': 1, '2860': 1, '2859': 1, '6820': 1, '1890': 1, 'V1201': 1, '42789': 1, '25000': 1, '4019': 1, '66401': 1, '65951': 1, '64891': 1, 'V270': 1, 'V0251': 1, '8020': 1, '41071': 1, '5849': 1, '7802': 1, '7847': 1}


In [72]:
print(ED100_df["icd_code"].value_counts().to_dict())

{'4019': 5, '30500': 4, '25000': 4, '486': 3, 'E9289': 3, '95901': 3, '7802': 3, '78097': 2, '78909': 2, '5856': 2, '311': 2, 'V4586': 2, 'V5861': 2, '3249': 1, '27651': 1, '6084': 1, '78079': 1, '7842': 1, '5609': 1, '49392': 1, '4254': 1, '7840': 1, '78605': 1, '87343': 1, '99673': 1, '5277': 1, '78703': 1, 'E9689': 1, '3320': 1, 'E8189': 1, '7825': 1, '8830': 1, '78650': 1, 'E9179': 1, '99674': 1, '591': 1, '25061': 1, '25011': 1, '87342': 1, '5363': 1, '30590': 1, '5939': 1, '5538': 1, '78906': 1, '5409': 1, '5259': 1, '7295': 1, '52100': 1, '79092': 1, '5920': 1, '4553': 1, '56400': 1, '78907': 1, '78791': 1, '7821': 1, '6827': 1, 'V4511': 1, '2724': 1, '5761': 1, '78060': 1, '7080': 1, '72671': 1, '59080': 1, 'E8120': 1, '55329': 1, '5990': 1, '30000': 1, '7089': 1, '9953': 1, '2989': 1, '78701': 1, '7242': 1, '5770': 1, '5750': 1, '29630': 1, 'V6284': 1}


In [73]:
# value_counts
# new dataframe
# concate
# fill zero
# transform
# chisquare

IPD100_df_count = IPD100_df.value_counts()
print(IPD100_df_count)
print(type(IPD100_df_count))

icd_code
496         4
78959       4
5715        4
2761        3
3051        3
           ..
W010XXA     1
Y93K1       1
Y92480      1
Z7901       1
Z87442      1
Name: count, Length: 72, dtype: int64
<class 'pandas.core.series.Series'>


In [74]:
# create dataframe
data = {
    "IPD": IPD100_df_count
}

df = pd.DataFrame(data)
display(df)

Unnamed: 0_level_0,IPD
icd_code,Unnamed: 1_level_1
496,4
78959,4
5715,4
2761,3
3051,3
...,...
W010XXA,1
Y93K1,1
Y92480,1
Z7901,1


In [79]:
ED100_df_count = ED100_df.value_counts()
ED100_df_dict = ED100_df_count.to_dict()
IPD100_df_dict = IPD100_df_count.to_dict()

In [80]:
# check intersec
ED100_df_dict.keys() & IPD100_df_dict.keys()

{('25000',), ('30500',), ('4019',), ('7802',), ('78791',)}

In [84]:
ED100_df_count = ED100_df_count.rename("ED")
output = pd.concat([df, ED100_df_count], axis=1, )
print(output)

          IPD   ED
icd_code          
496       4.0  NaN
78959     4.0  NaN
5715      4.0  NaN
2761      3.0  NaN
3051      3.0  NaN
...       ...  ...
E8189     NaN  1.0
E9689     NaN  1.0
E9179     NaN  1.0
V4511     NaN  1.0
V6284     NaN  1.0

[143 rows x 2 columns]


In [85]:
output.loc[["25000","30500","4019","7802","78791"], :]

Unnamed: 0_level_0,IPD,ED
icd_code,Unnamed: 1_level_1,Unnamed: 2_level_1
25000,1.0,4.0
30500,1.0,4.0
4019,1.0,5.0
7802,1.0,3.0
78791,1.0,1.0


In [86]:
output_fillzero = output.fillna(0)
output_fillzero

Unnamed: 0_level_0,IPD,ED
icd_code,Unnamed: 1_level_1,Unnamed: 2_level_1
496,4.0,0.0
78959,4.0,0.0
5715,4.0,0.0
2761,3.0,0.0
3051,3.0,0.0
...,...,...
E8189,0.0,1.0
E9689,0.0,1.0
E9179,0.0,1.0
V4511,0.0,1.0


In [87]:
transformed_data = demo_transform(output_fillzero)
print(transformed_data)

[[4. 4. 4. 3. 3. 3. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 4. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 3. 0. 4. 5. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  3. 3. 3. 2. 2. 2. 2. 2. 2. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
  1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]]


In [89]:
results = stats.chi2_contingency(transformed_data)
# print(results)
pvalue_result = results.pvalue
print("pvalue:",pvalue_result)
print(evaluate_pvalue(pvalue_result))

pvalue: 0.008570772774445887
Dependent (reject H0)
