In [None]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sstats
import scipy as sp

#Task 1)

The way we sample from continuous distribution is to have a 3D tuple, with

$(X_{t_i),T_i,t_i)$ where $t_i$ is the time spent in state $X_i$ and $T_i$ is the time at which we enter state $X_i$.


In [None]:
Q_cunt = np.array([
    [-0.0085,0.005,0.0025,0,0.001],
    [0,-0.014,0.005,0.004,0.005],
    [0,0,-0.008,0.003,0.005],
    [0,0,0,-0.009,0.009],
    [0,0,0,0,0]])

x_start = 0
nrSamples = 10000


def continuousMarkovSampling(nrSamples,Q,X_end = 4,x_start = 0):
    CMC_samples = []

    for i in range(nrSamples):
        T = 0
        Xi = x_start
        sample = []

        while(Xi != X_end):
            P_current = Q[Xi]
            t_sojourn = np.random.exponential(-1*(1/P_current[Xi]))
            sample += [(Xi,T,t_sojourn)]
            T += t_sojourn
            #Pick the next state
            Xi = np.random.choice([i for i in range(Xi+1,5)],p=-1*P_current[Xi+1:]/P_current[Xi])


        #Add the last sample - makes data processing easier later
        P_current = Q[Xi]
        t_sojourn = np.exp(P_current[Xi])
        sample += [(Xi,T,t_sojourn)]

        CMC_samples += [sample]

    return CMC_samples

samples = continuousMarkovSampling(nrSamples,Q_cunt,4,0)

In [None]:


lifetimes = [sample[-1][1] for sample in samples]
plt.hist(lifetimes,bins=100)

Since we do not know the true standard deviation of our sample, we use the t-distribution to create a confidence interval.

# TODO: Må vi bare gerne bruge t-test til std også?

In [None]:
import scipy.stats as sstats

X_bar = np.mean(lifetimes)
sigma_bar =  1/(nrSamples-1)*np.sum((lifetimes-X_bar)**2)
s = np.sqrt(sigma_bar)


def t_conf(n,test_stat):
    df = n - 1
    t_constant = sstats.t.ppf(1-0.025,df)
    return np.array([-1,1])*t_constant*(s/np.sqrt(df+1)) + test_stat




print(f"This is the C.I for the mean: {t_conf(len(lifetimes),X_bar)}")
print(f"This is the C.I for the std: {t_conf(len(lifetimes),s)}")

In [None]:
#Boolean: Stage 3 but less than death, (Time start + time sojourn) > 30.50
cancer_dist_305 = [woman for woman in samples if len([T for (Xi,T,ti) in woman if ((Xi >= 2 and Xi<4) and ((T+ti)>=30.5))])>0]
print(f"Proportion of women who experience long distance cancer after the 30.5th month: {len(cancer_dist_305)/len(samples)}")

# Task 8

We know the parameters of the distribution (Qs and p0), furthermore we get the theoretical CDF of the lifetime distribution.

We therefore decide to do a Kolomogorov-Smirnov test, with a $\alpha = 0.5$ level of significance.

That is; We must have $D_n <= 1.358$ in order to not forkaste the hypothesis.

# TODO: Hvorfor er det, at jo højere en confidence vi vil have, jo højere en critical value får vi? Altså K_{95}<=K_{97}

In [None]:
Qs = Q_cunt[:4,:4]
p0 = [1,0,0,0]
F = lambda t: 1 - np.dot(p0,(sp.linalg.expm(Qs*t)@np.ones(4)))

In [None]:
(-np.array([1,0,0,0])@np.linalg.inv(Qs)).sum() #Real mean

In [None]:
F_e = lambda x: len(np.where(np.array(lifetimes) <= x)[0])/len(lifetimes)

In [None]:
Dn = np.max([np.abs(F_e(x) - F(x)) for x in lifetimes])
n = len(lifetimes)
Dn = (np.sqrt(n) + 0.12 + 0.11/np.sqrt(n))*Dn
print(f"We get that we accept the null hypothesis, that our samples follow the given phase distribution, due to the fact, that our test statistic is below the critical value of 1.358, specifically we get: {Dn}")

# Task 9

We reuse the empirial CDF from before.

Based solely off of the graph of the two survival functions, it does appear, that in general there is a higher probability of surviving using the preventative treatment. That is $P(T_{prev} \geq x) \geq P(T \geq x)$.

However, it doesn't tell the full story; It might be, that we cut the lives of those that live a short time shorter by some percentage, and extend the lives of those that live a long time by a higher percentage, resulting in an overall increase, but clearly not an absolute positive in the sense of saving lives.

In [None]:
N = len(lifetimes)
d = lambda x,data: len(np.where(np.array(data) <= x)[0])
S = lambda t,data: (len(data)-d(t,data))/len(data)

In [None]:
Q_prvnt = np.array([[0,0.0025,0.00125,0,0.001],
          [0,0,0,0.002,0.005],
          [0,0,0,0.003,0.005],
          [0,0,0,0,0.009],
          [0,0,0,0,0]])

for i in range(5):
    Q_prvnt[i][i] = -1*np.sum(Q_prvnt[i])

print(Q_prvnt)

samples_prvnt = continuousMarkovSampling(1000,Q_prvnt)
samples_1000 = continuousMarkovSampling(1000,Q_cunt)

lifetimes_prvnt = np.array([sample[-1][1] for sample in samples_prvnt])
lifetimes_1000 = np.array([sample[-1][1] for sample in samples_1000])
print("")


x_values_plot = np.linspace(0,1400,14000)

[S(x,lifetimes_prvnt) for x in np.linspace(0,1400,14000)]
plt.plot(x_values_plot,[S(x,lifetimes_prvnt) for x in x_values_plot],label="preventive survival")
plt.plot(x_values_plot,[S(x,lifetimes) for x in x_values_plot],label="non-preventive survival")

# Adding title and labels
plt.title("Survival Curves")
plt.xlabel("Time")
plt.ylabel("Survival Probability")

# Adding legends
plt.legend()

# Task 10 Optional

As I've understood it, what we end up with is a test statistic that can be delimited using a confidence interval based off of the standard normal distribution.

We can then compare the confidence interval for the two treatments to see, if they overlap.

In [None]:
N_ij = lambda j,lifetime_data: len([lifetime for lifetime in lifetime_data if lifetime > j])
O_ij = lambda _N,lifetime_data: len(lifetime_data) - _N

In [446]:
s12 = [np.sort(lifetimes_prvnt),np.sort(lifetimes)]
Z = []

for i in range(2):
    Zi = 0

    for j in s12[i]:
        Nij = N_ij(j,s12[i])
        Nkj = N_ij(j,s12[i-1])
        Nj = Nij + Nkj

        Oij = O_ij(Nij,s12[i])
        Okj = O_ij(Nkj,s12[i])
        Oj = Oij + Okj



        if(Nj >1):
            Eij = Oj*(Nij/Nj)
            Vij = Eij*((Nj-Oj)/Nj)*((Nj-Nij)/(Nj-1))
            Zi += (Oij - Eij)/np.sqrt(Vij)

    Z += [Zi]

  Zi += (Oij - Eij)/np.sqrt(Vij)
  Zi += (Oij - Eij)/np.sqrt(Vij)


In [445]:
Z[0]

nan