In [12]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import kruskal, mannwhitneyu

from itertools import product
 

In [13]:
# Load the dataset; the first CSV column is used as the DataFrame index.
df = pd.read_csv("data/IST_corrected_clean.csv", index_col=[0])

In [14]:
def print_stats(stat, p, alpha=0.05):
    """Print a test statistic, its p-value and a one-line verdict.

    Parameters
    ----------
    stat : float
        Test statistic returned by the hypothesis test.
    p : float
        p-value returned by the hypothesis test.
    alpha : float, optional
        Significance threshold. Defaults to 0.05, the value that was
        previously hard-coded.
    """
    print('stat = %.3f, p = %.3f' % (stat, p))
    # A p-value exactly equal to alpha is reported as "different", as before.
    if p > alpha:
        print('Probably the same distribution')
    else:
        print('Probably different distributions')

# Hypothesis testing

+ aspirin vs non-aspirin
+ heparin vs non-heparin
    + non vs low vs medium heparin
+ combination of aspirin and heparin
    + (significantly) different outcome than separate effects of aspirin and heparin

## Aspirin vs no aspirin



### Outcome after 14 days

The endpoint after 14 days is 'DDEAD'. It indicates whether or not a patient died within 14 days.

Group patients by their aspirin treatment status (Y/N) and collect their outcomes after 14 days.

In [15]:
# One list of DDEAD values per DASP14 group.
list_asp14 = df.groupby("DASP14")["DDEAD"].apply(list)

In [16]:
# Show the DDEAD lists collected per DASP14 group.
list_asp14

DASP14
N    [N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...
Y    [N, N, N, N, N, N, Y, Y, N, N, N, N, N, N, N, ...
Name: DDEAD, dtype: object

In [17]:
# Compare the 14-day outcome between the two DASP14 groups.
# The Series is indexed by group label, so select the groups by position with
# .iloc; integer keys passed to [] on a labelled Series are deprecated in pandas.
stat, p = kruskal(list_asp14.iloc[0], list_asp14.iloc[1])
print('Kruskal Wallis')
print_stats(stat, p)

Kruskal Wallis
stat = 10.405, p = 0.001
Probably different distributions


In [46]:
# Keep the DASP14 groupby around; later cells reuse it for other outcome columns.
group_asp = df.groupby("DASP14")
# Number of patients per (DASP14, DDEAD) combination.
group_asp["DDEAD"].value_counts()

DASP14  DDEAD
N       N        8820
        Y        1122
Y       N        7933
        Y         865
Name: DDEAD, dtype: int64

In [47]:
# Count the (DASP14, DDEAD) combinations once and look them up by label.
# value_counts() orders each group by frequency, so positional access would
# depend on the data; integer keys on a labelled Series are also deprecated.
asp14_counts = group_asp['DDEAD'].value_counts()
asp14_nn = asp14_counts.loc[('N', 'N')]  # DASP14 = N, DDEAD = N
asp14_ny = asp14_counts.loc[('N', 'Y')]  # DASP14 = N, DDEAD = Y
asp14_yn = asp14_counts.loc[('Y', 'N')]  # DASP14 = Y, DDEAD = N
asp14_yy = asp14_counts.loc[('Y', 'Y')]  # DASP14 = Y, DDEAD = Y

In [49]:
# Percentage of patients with DDEAD = N in each aspirin group, to one decimal.
pct_alive14_no_asp = round((asp14_nn / (asp14_nn + asp14_ny) * 100), 1)
pct_alive14_asp = round((asp14_yn / (asp14_yn + asp14_yy) * 100), 1)

print("--- No aspirin during study ---")
print(f'{pct_alive14_no_asp} % of patients living 14 days after their stroke')
print()
print("--- Aspirin during study ---")
print(f'{pct_alive14_asp} % of patients living 14 days after their stroke')

--- No aspirin during study ---
88.7 % of patients living 14 days after their stroke

--- Aspirin during study ---
90.2 % of patients living 14 days after their stroke


Comparing the groups with the Kruskal-Wallis test, we see a significant difference in the 14-day outcome between patients who were given aspirin and those who were not. A larger share of the patients who did not receive aspirin died (11.3 %) than of those who received it (9.8 %).


### Outcome after 6 months

Two endpoints are of interest after 6 months: 'FRECOVER', which records whether a patient has fully recovered, and 'FDEAD', which records whether a patient has died.

In [45]:
# One list of FRECOVER values per DASP14 group.
list_asp6 = df.groupby('DASP14')['FRECOVER'].apply(list)
print(list_asp6)
print()
# Select the two groups by position with .iloc; integer keys passed to [] on a
# label-indexed Series are deprecated in pandas.
# NOTE(review): the printed lists contain nan entries, which are passed to
# kruskal unchanged -- confirm how the installed SciPy treats them, or drop
# missing FRECOVER values before testing.
stat, p = kruskal(list_asp6.iloc[0], list_asp6.iloc[1])
print('Kruskal Wallis')
print_stats(stat, p)

DASP14
N    [N, Y, N, N, N, Y, N, N, N, N, N, N, N, N, Y, ...
Y    [N, N, N, N, N, N, nan, nan, N, N, N, N, Y, N,...
Name: FRECOVER, dtype: object

Kruskal Wallis
stat = 2.949, p = 0.086
Probably the same distribution


There is no significant difference in full recovery after 6 months between patients who received aspirin and those who did not. However, the proportion of patients who died within 6 months differs significantly between the two groups, as the next test shows.

In [20]:
# One list of FDEAD values per DASP14 group (reuses the name list_asp6).
list_asp6 = df.groupby('DASP14')['FDEAD'].apply(list)
print(list_asp6)
print()
# Select the two groups by position with .iloc; integer keys passed to [] on a
# label-indexed Series are deprecated in pandas.
stat, p = kruskal(list_asp6.iloc[0], list_asp6.iloc[1])
print('Kruskal Wallis')
print_stats(stat, p)

DASP14
N    [N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...
Y    [N, N, N, N, N, N, Y, Y, N, N, N, N, N, N, N, ...
Name: FDEAD, dtype: object

Kruskal Wallis
stat = 20.668, p = 0.000
Probably different distributions


In [27]:
# Number of patients per (DASP14, FDEAD) combination.
group_asp['FDEAD'].value_counts()

DASP14  FDEAD
N       N        7561
        Y        2381
Y       N        6936
        Y        1862
Name: FDEAD, dtype: int64

In [28]:
# Count the (DASP14, FDEAD) combinations once and look them up by label.
# value_counts() orders each group by frequency, so positional access would
# depend on the data; integer keys on a labelled Series are also deprecated.
asp6_counts = group_asp['FDEAD'].value_counts()
asp6_nn = asp6_counts.loc[('N', 'N')]  # DASP14 = N, FDEAD = N
asp6_ny = asp6_counts.loc[('N', 'Y')]  # DASP14 = N, FDEAD = Y
asp6_yn = asp6_counts.loc[('Y', 'N')]  # DASP14 = Y, FDEAD = N
asp6_yy = asp6_counts.loc[('Y', 'Y')]  # DASP14 = Y, FDEAD = Y

In [33]:
# Percentage of patients with FDEAD = N in each aspirin group, to one decimal.
pct_alive6_no_asp = round((asp6_nn / (asp6_nn + asp6_ny) * 100), 1)
pct_alive6_asp = round((asp6_yn / (asp6_yn + asp6_yy) * 100), 1)

print("--- No aspirin during study ---")
print(f'{pct_alive6_no_asp} % of patients living 6 months after their stroke')
print()
print("--- Aspirin during study ---")
print(f'{pct_alive6_asp} % of patients living 6 months after their stroke')

--- No aspirin during study ---
76.1 % of patients living 6 months after their stroke

--- Aspirin during study ---
78.8 % of patients living 6 months after their stroke


## Heparin vs non-heparin


### Outcome after 14 days

In [103]:
# One list of DDEAD values per RXHEP14 group.
list_hep14 = df.groupby(by=["RXHEP14"])["DDEAD"].apply(list)

In [104]:
# Show the DDEAD lists collected per RXHEP14 group.
list_hep14

RXHEP14
N    [N, N, N, N, N, N, N, Y, Y, N, N, N, N, N, N, ...
Y    [N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...
Name: DDEAD, dtype: object

In [105]:
# Compare the 14-day outcome between the two RXHEP14 groups.
# Select the groups by position with .iloc; integer keys passed to [] on a
# label-indexed Series are deprecated in pandas.
stat, p = kruskal(list_hep14.iloc[0], list_hep14.iloc[1])
print_stats(stat, p)

stat = 0.756, p = 0.385
Probably the same distribution


After 14 days: No difference between heparin and no heparin.

### Outcome after 6 months

In [179]:
# One list of FRECOVER values per RXHEP14 group.
list_hep6 = df.groupby(by=['RXHEP14'])['FRECOVER'].apply(list)
# Select the groups by position with .iloc; integer keys passed to [] on a
# label-indexed Series are deprecated in pandas.
# NOTE(review): FRECOVER shows nan entries elsewhere in this notebook; they are
# passed to kruskal unchanged here -- confirm that this is intended.
stat, p = kruskal(list_hep6.iloc[0], list_hep6.iloc[1])
print_stats(stat, p)

stat = 0.552, p = 0.458
Probably the same distribution


In [180]:
# One list of FDEAD values per RXHEP14 group (reuses the name list_hep6).
list_hep6 = df.groupby(by=['RXHEP14'])['FDEAD'].apply(list)
# Select the groups by position with .iloc; integer keys passed to [] on a
# label-indexed Series are deprecated in pandas.
stat, p = kruskal(list_hep6.iloc[0], list_hep6.iloc[1])
print_stats(stat, p)

stat = 0.877, p = 0.349
Probably the same distribution


## No vs low vs medium heparin


### Outcome after 14 days

In [115]:
# One list of DDEAD values per HEP14 group.
list2_hep14 = df.groupby(by=["HEP14"])["DDEAD"].apply(list)

In [116]:
# Show the DDEAD lists collected per HEP14 group.
list2_hep14 

HEP14
L    [N, N, N, N, N, N, N, N, N, N, N, Y, N, N, N, ...
M    [N, N, N, N, N, N, N, N, N, Y, N, Y, N, N, N, ...
N    [N, N, N, N, N, N, N, Y, Y, N, N, N, N, N, N, ...
Name: DDEAD, dtype: object

In [117]:
# Compare the 14-day outcome across all HEP14 groups.
# Unpacking the Series passes every group's list in index order; this avoids
# integer keys on a label-indexed Series, which are deprecated in pandas.
stat, p = kruskal(*list2_hep14)
print_stats(stat, p)

stat = 2.035, p = 0.362
Probably the same distribution


After 14 days: No difference between low, medium and no heparin.

### Outcome after 6 months

In [181]:
# One list of FRECOVER values per HEP14 group.
list2_hep6 = df.groupby(by=['HEP14'])['FRECOVER'].apply(list)
# Unpacking the Series passes every group's list in index order; this avoids
# integer keys on a label-indexed Series, which are deprecated in pandas.
# NOTE(review): FRECOVER shows nan entries elsewhere in this notebook; they are
# passed to kruskal unchanged here -- confirm that this is intended.
stat, p = kruskal(*list2_hep6)
print_stats(stat, p)

stat = 0.756, p = 0.685
Probably the same distribution


In [182]:
# One list of FDEAD values per HEP14 group (reuses the name list2_hep6).
list2_hep6 = df.groupby(by=['HEP14'])['FDEAD'].apply(list)
# Unpacking the Series passes every group's list in index order; this avoids
# integer keys on a label-indexed Series, which are deprecated in pandas.
stat, p = kruskal(*list2_hep6)
print_stats(stat, p)

stat = 1.050, p = 0.592
Probably the same distribution


## Combination of aspirin and heparin
(significantly) different outcome than separate effects of aspirin and heparin

In [63]:
# List the distinct values found in the TREAT14 column.
df.TREAT14.unique()

array(['Aspirin', 'Low Heparin', 'Medium Heparin',
       'Aspirin + Medium Heparin', 'Control', 'Aspirin + Low Heparin'],
      dtype=object)

### Outcome after 14 days

In [143]:
# One list of DDEAD values per TREAT14 group.
list_treat14 = df.groupby(by=["TREAT14"])["DDEAD"].apply(list)

In [144]:
# Show the DDEAD lists collected per TREAT14 group.
list_treat14

TREAT14
Aspirin                     [N, N, Y, Y, N, N, N, N, N, N, N, N, N, N, N, ...
Aspirin + Low Heparin       [N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, ...
Aspirin + Medium Heparin    [N, N, N, N, N, Y, N, N, N, N, N, N, N, Y, N, ...
Control                     [N, N, N, N, N, N, N, N, N, N, Y, N, N, N, Y, ...
Low Heparin                 [N, N, N, N, N, Y, N, N, N, N, N, N, N, N, N, ...
Medium Heparin              [N, N, N, N, Y, N, N, N, N, N, N, N, N, N, N, ...
Name: DDEAD, dtype: object

In [145]:
# Compare the 14-day outcome across all TREAT14 groups.
# Unpacking the Series passes every group's list in index order, so the call no
# longer hard-codes six integer keys (deprecated on a label-indexed Series).
stat, p = kruskal(*list_treat14)
print_stats(stat, p)

stat = 14.675, p = 0.012
Probably different distributions


In [146]:
# Mann-Whitney U test for every ordered pair of TREAT14 groups; the printed
# indices are the positions of the groups in list_treat14.
# NOTE(review): this also tests each group against itself and runs every pair
# in both orders, and no multiple-comparison correction is applied.
numbered_groups = list(enumerate(list_treat14))
for (first_idx, first_group), (second_idx, second_group) in product(numbered_groups, repeat=2):
    print(first_idx, second_idx)
    stat, p = mannwhitneyu(first_group, second_group)
    print_stats(stat, p)
    print()

0 0
stat = 11007432.000, p = 0.500
Probably the same distribution

0 1
stat = 4971980.000, p = 0.220
Probably the same distribution

0 2
stat = 4646727.500, p = 0.127
Probably the same distribution

0 3
stat = 12954773.500, p = 0.001
Probably different distributions

0 4
stat = 5105552.000, p = 0.083
Probably the same distribution

0 5
stat = 4965158.000, p = 0.080
Probably the same distribution

1 0
stat = 4971980.000, p = 0.220
Probably the same distribution

1 1
stat = 2272712.000, p = 0.500
Probably the same distribution

1 2
stat = 2098778.500, p = 0.052
Probably the same distribution

1 3
stat = 5850916.500, p = 0.001
Probably different distributions

1 4
stat = 2305992.000, p = 0.033
Probably different distributions

1 5
stat = 2242578.000, p = 0.032
Probably different distributions

2 0
stat = 4646727.500, p = 0.127
Probably the same distribution

2 1
stat = 2098778.500, p = 0.052
Probably the same distribution

2 2
stat = 1998000.500, p = 0.500
Probably the same distribution



In [35]:
# Keep the TREAT14 groupby around; later cells reuse it for other outcome columns.
group_treat = df.groupby(by=["TREAT14"])
# Number of patients per (TREAT14, DDEAD) combination.
group_treat["DDEAD"].value_counts()

TREAT14                   DDEAD
Aspirin                   N        4223
                          Y         457
Aspirin + Low Heparin     N        1930
                          Y         195
Aspirin + Medium Heparin  N        1780
                          Y         213
Control                   N        4957
                          Y         652
Low Heparin               N        1961
                          Y         238
Medium Heparin            N        1906
                          Y         232
Name: DDEAD, dtype: int64

In [36]:
# Count the (TREAT14, DDEAD) combinations once and look them up by label.
# value_counts() orders each group by frequency, so positional access would
# depend on the data; integer keys on a labelled Series are also deprecated.
treat14_counts = group_treat['DDEAD'].value_counts()

treat14_a_n = treat14_counts.loc[('Aspirin', 'N')]
treat14_a_y = treat14_counts.loc[('Aspirin', 'Y')]

treat14_alh_n = treat14_counts.loc[('Aspirin + Low Heparin', 'N')]
treat14_alh_y = treat14_counts.loc[('Aspirin + Low Heparin', 'Y')]

treat14_amh_n = treat14_counts.loc[('Aspirin + Medium Heparin', 'N')]
treat14_amh_y = treat14_counts.loc[('Aspirin + Medium Heparin', 'Y')]

treat14_c_n = treat14_counts.loc[('Control', 'N')]
treat14_c_y = treat14_counts.loc[('Control', 'Y')]

treat14_lh_n = treat14_counts.loc[('Low Heparin', 'N')]
treat14_lh_y = treat14_counts.loc[('Low Heparin', 'Y')]

treat14_mh_n = treat14_counts.loc[('Medium Heparin', 'N')]
treat14_mh_y = treat14_counts.loc[('Medium Heparin', 'Y')]



In [44]:
# 14-day report per treatment arm.
# Each entry: (header line, DDEAD = N count, DDEAD = Y count, optional note).
report_rows_14d = [
    ("--- Aspirin during study ---", treat14_a_n, treat14_a_y,
     '-- Significantly different compared to the control group (p < 0.05)'),
    ("--- Aspirin + Low Heparin during study ---", treat14_alh_n, treat14_alh_y,
     '-- Significantly different compared to both Heparin groups and the control group (p < 0.05)'),
    ("--- Aspirin + Medium Heparin during study ---", treat14_amh_n, treat14_amh_y, None),
    ("--- Low Heparin during study ---", treat14_lh_n, treat14_lh_y, None),
    ("--- Medium Heparin during study ---", treat14_mh_n, treat14_mh_y, None),
    ("--- Control (neither Aspirin nor Heparin during study) ---", treat14_c_n, treat14_c_y, None),
]
for header, n_alive, n_dead, note in report_rows_14d:
    print(header)
    print(f'{round((n_alive / (n_alive + n_dead) * 100),1)} % of patients living 14 days after their stroke')
    if note is not None:
        print(note)
    print()

--- Aspirin during study ---
90.2 % of patients living 14 days after their stroke
-- Significantly different compared to the control group (p < 0.05)

--- Aspirin + Low Heparin during study ---
90.8 % of patients living 14 days after their stroke
-- Significantly different compared to both Heparin groups and the control group (p < 0.05)

--- Aspirin + Medium Heparin during study ---
89.3 % of patients living 14 days after their stroke

--- Low Heparin during study ---
89.2 % of patients living 14 days after their stroke

--- Medium Heparin during study ---
89.1 % of patients living 14 days after their stroke

--- Control (neither Aspirin nor Heparin during study) ---
88.4 % of patients living 14 days after their stroke



### Outcome after 6 months

In [153]:
# One list of FRECOVER values per TREAT14 group.
list_treat6 = df.groupby(by=["TREAT14"])["FRECOVER"].apply(list)

In [154]:
# Compare the values in list_treat6 across all TREAT14 groups.
# Unpacking the Series passes every group's list in index order, so the call no
# longer hard-codes six integer keys (deprecated on a label-indexed Series).
# NOTE(review): if list_treat6 holds FRECOVER here, it may contain nan entries
# that are passed to kruskal unchanged -- confirm that this is intended.
stat, p = kruskal(*list_treat6)
print_stats(stat, p)

stat = 4.687, p = 0.455
Probably the same distribution


In [187]:
# One list of FDEAD values per TREAT14 group (reuses the name list_treat6).
list_treat6 = df.groupby(by=['TREAT14'])['FDEAD'].apply(list)
# Unpacking the Series passes every group's list in index order, so the call no
# longer hard-codes six integer keys (deprecated on a label-indexed Series).
stat, p = kruskal(*list_treat6)
print_stats(stat, p)

stat = 23.753, p = 0.000
Probably different distributions


In [41]:
# Show the outcome lists currently stored in list_treat6, one row per TREAT14 group.
list_treat6

TREAT14
Aspirin                     [N, N, Y, Y, N, N, N, N, N, N, N, N, N, N, N, ...
Aspirin + Low Heparin       [N, N, N, N, N, N, Y, N, N, N, N, N, N, Y, N, ...
Aspirin + Medium Heparin    [N, N, N, N, N, Y, N, N, N, N, N, N, N, Y, N, ...
Control                     [N, N, N, N, N, N, N, N, N, N, Y, N, N, N, Y, ...
Low Heparin                 [N, N, N, N, N, Y, N, N, Y, N, N, N, N, N, N, ...
Medium Heparin              [N, N, N, N, Y, N, N, N, N, N, N, N, N, N, N, ...
Name: FDEAD, dtype: object

In [8]:
# Mann-Whitney U test for every ordered pair of groups in list_treat6; the
# printed indices are the positions of the groups in that Series.
# NOTE(review): this also tests each group against itself and runs every pair
# in both orders, and no multiple-comparison correction is applied.
numbered_groups = list(enumerate(list_treat6))
for (first_idx, first_group), (second_idx, second_group) in product(numbered_groups, repeat=2):
    print(first_idx, second_idx)
    stat, p = mannwhitneyu(first_group, second_group)
    print_stats(stat, p)
    print()

0 0
stat = 11007432.000, p = 0.500
Probably the same distribution

0 1
stat = 4977864.000, p = 0.327
Probably the same distribution

0 2
stat = 4652452.000, p = 0.234
Probably the same distribution

0 3
stat = 12764314.000, p = 0.000
Probably different distributions

0 4
stat = 5028574.000, p = 0.008
Probably different distributions

0 5
stat = 4916824.000, p = 0.027
Probably different distributions

1 0
stat = 4977864.000, p = 0.327
Probably the same distribution

1 1
stat = 2272712.000, p = 0.500
Probably the same distribution

1 2
stat = 2103873.500, p = 0.159
Probably the same distribution

1 3
stat = 5771337.000, p = 0.000
Probably different distributions

1 4
stat = 2273739.500, p = 0.008
Probably different distributions

1 5
stat = 2223272.000, p = 0.022
Probably different distributions

2 0
stat = 4652452.000, p = 0.234
Probably the same distribution

2 1
stat = 2103873.500, p = 0.159
Probably the same distribution

2 2
stat = 1998000.500, p = 0.500
Probably the same distributi

In [38]:
# Number of patients per (TREAT14, FDEAD) combination.
group_treat['FDEAD'].value_counts()

TREAT14                   FDEAD
Aspirin                   N        3692
                          Y         988
Aspirin + Low Heparin     N        1687
                          Y         438
Aspirin + Medium Heparin  N        1557
                          Y         436
Control                   N        4241
                          Y        1368
Low Heparin               N        1680
                          Y         519
Medium Heparin            N        1644
                          Y         494
Name: FDEAD, dtype: int64

In [39]:
# Count the (TREAT14, FDEAD) combinations once and look them up by label.
# value_counts() orders each group by frequency, so positional access would
# depend on the data; integer keys on a labelled Series are also deprecated.
treat6_counts = group_treat['FDEAD'].value_counts()

treat6_a_n = treat6_counts.loc[('Aspirin', 'N')]
treat6_a_y = treat6_counts.loc[('Aspirin', 'Y')]

treat6_alh_n = treat6_counts.loc[('Aspirin + Low Heparin', 'N')]
treat6_alh_y = treat6_counts.loc[('Aspirin + Low Heparin', 'Y')]

treat6_amh_n = treat6_counts.loc[('Aspirin + Medium Heparin', 'N')]
treat6_amh_y = treat6_counts.loc[('Aspirin + Medium Heparin', 'Y')]

treat6_c_n = treat6_counts.loc[('Control', 'N')]
treat6_c_y = treat6_counts.loc[('Control', 'Y')]

treat6_lh_n = treat6_counts.loc[('Low Heparin', 'N')]
treat6_lh_y = treat6_counts.loc[('Low Heparin', 'Y')]

treat6_mh_n = treat6_counts.loc[('Medium Heparin', 'N')]
treat6_mh_y = treat6_counts.loc[('Medium Heparin', 'Y')]

In [43]:
# 6-month report per treatment arm.
# Each entry: (header line, FDEAD = N count, FDEAD = Y count, optional note).
# Using one shared message template fixes the "6 monthsafter" typo, and the
# shared note string fixes the missing space in "control group(p < 0.05)".
note_heparin_and_control = '-- Significantly different compared to both Heparin groups and the control group (p < 0.05)'
note_control_only = '-- Significantly different compared to the control group (p < 0.05)'
report_rows_6m = [
    ("--- Aspirin during study ---", treat6_a_n, treat6_a_y, note_heparin_and_control),
    ("--- Aspirin + Low Heparin during study ---", treat6_alh_n, treat6_alh_y, note_heparin_and_control),
    ("--- Aspirin + Medium Heparin during study ---", treat6_amh_n, treat6_amh_y, note_control_only),
    ("--- Low Heparin during study ---", treat6_lh_n, treat6_lh_y, None),
    ("--- Medium Heparin during study ---", treat6_mh_n, treat6_mh_y, None),
    ("--- Control (neither Aspirin nor Heparin during study) ---", treat6_c_n, treat6_c_y, None),
]
for header, n_alive, n_dead, note in report_rows_6m:
    print(header)
    print(f'{round((n_alive / (n_alive + n_dead) * 100),1)} % of patients living 6 months after their stroke')
    if note is not None:
        print(note)
    print()

--- Aspirin during study ---
78.9 % of patients living 6 months after their stroke
-- Significantly different compared to both Heparin groups and the control group (p < 0.05)

--- Aspirin + Low Heparin during study ---
79.4 % of patients living 6 months after their stroke
-- Significantly different compared to both Heparin groups and the control group(p < 0.05)

--- Aspirin + Medium Heparin during study ---
78.1 % of patients living 6 months after their stroke
-- Significantly different compared to the control group (p < 0.05)

--- Low Heparin during study ---
76.4 % of patients living 6 months after their stroke

--- Medium Heparin during study ---
76.9 % of patients living 6 monthsafter their stroke

--- Control (neither Aspirin nor Heparin during study) ---
75.6 % of patients living 6 months after their stroke

