In [2]:
import numpy as np
import pandas as pd
from sksurv.datasets import load_whas500
from sksurv.linear_model import CoxPHSurvivalAnalysis
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
X, y = load_whas500()
X = X.astype(float)
# Combining features and events is easier to work with for now
combined = pd.concat([X, pd.DataFrame(y)], axis=1)
combined['lenfol'] = combined['lenfol'].astype(int)
TARGET_COLUMNS = ['fstat', 'lenfol']



right_censored = np.array([el[0] for el in y])
event_times = np.array([el[1] for el in y]).astype(int)

# Constructing the components
In order to solve equation 8we need to filter and group the data


## $D_t$
We need to group the records on event time, ignore the right-censored records

Then we get $D_t$ for every $t$ from $t=1$ to $T$

In [4]:
# First ignore all right-censored records
dt = combined[~combined['fstat']]

# We don't need the censor column anymore
dt = dt.drop(['fstat'], axis=1)

# Group on event time
dt = dt.groupby('lenfol')

dt.describe().head()

Unnamed: 0_level_0,afb,afb,afb,afb,afb,afb,afb,afb,age,age,...,sho,sho,sysbp,sysbp,sysbp,sysbp,sysbp,sysbp,sysbp,sysbp
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
lenfol,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
368,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,46.0,...,0.0,0.0,1.0,149.0,,149.0,149.0,149.0,149.0,149.0
371,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,73.333333,...,0.0,0.0,3.0,132.333333,18.610033,115.0,122.5,130.0,141.0,152.0
373,1.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,65.0,...,0.0,0.0,1.0,164.0,,164.0,164.0,164.0,164.0,164.0
376,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,60.0,...,0.0,0.0,2.0,195.0,22.627417,179.0,187.0,195.0,203.0,211.0
386,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,55.5,...,0.0,0.0,2.0,141.5,34.648232,117.0,129.25,141.5,153.75,166.0


## $R_t$
$R_t$ denotes the set of samples at risk of the event at time $t$. This includes samples with an event at time t, the samples with an event later than time t, and right-censored samples.

*I __think__ that I can treat right-censored samples the same as regular samples for this set.*

In [5]:
rt = combined

# I will create a bucket per unique lenfol and create a new dataframe per bucket with all samples at risk at that time
unique_times = rt['lenfol'].unique()

num_unique_times = len(unique_times)


In [6]:
np.unique(np.arange(4))

array([0, 1, 2, 3])

In [8]:
def group_samples_at_risk(event_times: np.array):
    """
    Groups the indices of samples on whether they are at risk at a certain time.
    
    A sample is at risk at a certain time when its event time is greater or equal that time.
    """
    unique_times = np.unique(event_times)
    
    grouped = {}
    
    for t in unique_times:
        grouped[t] = np.argwhere(event_times>= t)
    
    return grouped

Rt = group_samples_at_risk(event_times)


def test_group_samples_at_risk_numbers_descend():
    # Testing if the resulting list descends in numbers
    previous_length = len(event_times) + 1

    for t in sorted(Rt.keys()):
        length = len(Rt[t])

        assert length < previous_length

        previous_length = length
        
test_group_samples_at_risk_numbers_descend()

## $\sum \limits_{t=1}^{T} \sum \limits_{n \in D_t} \mathbf{x}_{nk}$

$D_t$ is the list of indices with an observed event at time $t$.

This part seems to be constant throughout the optimization?

I think this is just a big fat sum of all the patients' covariants. It will stay constant per institution.


In [9]:
covariates_sum = combined.drop(TARGET_COLUMNS, axis=1).values.sum(axis=0)

covariates_sum


array([7.800000e+01, 3.492300e+04, 1.100000e+01, 1.330689e+04,
       1.550000e+02, 3.750000e+02, 3.913300e+04, 2.000000e+02,
       4.350900e+04, 3.058000e+03, 1.710000e+02, 1.530000e+02,
       2.200000e+01, 7.235200e+04])

In [10]:
# Covariates
X.values

array([[  1.,  83.,   0., ...,   0.,   0., 152.],
       [  0.,  49.,   0., ...,   1.,   0., 120.],
       [  0.,  70.,   0., ...,   1.,   0., 147.],
       ...,
       [  1.,  57.,   0., ...,   0.,   0., 120.],
       [  0.,  67.,   0., ...,   1.,   0., 112.],
       [  0.,  98.,   0., ...,   1.,   0., 160.]])

## Local update

$ \beta_k^{(p)} = \bigg[ \rho \sum \limits_{n=1}^{N} \mathbf{x}_{nk}\mathbf{x}_{nk}^T\bigg]^{-1} \cdot \bigg[\sum \limits_{n=1}^N  (\rho z_{nk}^{(p-1)} - \gamma_{nk}^{p-1}) \mathbf{x}_{nk} + \sum \limits_{t=1}^T \sum \limits_{n \in D_t} \mathbf{x}_{nk}\bigg] $

There are two parts to this computation that seem to be constant over iterations:
1. $\rho \sum \limits_{n=1}^{N} \mathbf{x}_{nk}\mathbf{x}_{nk}^T$
2. $\sum \limits_{t=1}^T \sum \limits_{n \in D_t} \mathbf{x}_{nk}$

Number 2. is also the part where we need to apply the scalar product protocol.

In [22]:
# Local update
RHO = 0.25

# Parts that stay constant over iterations
# Square all covariates and sum them together
# The formula says for every patient, x needs to be multiplied by itself.
# Squaring all covariates with themselves comes down to the same thing since x_nk is supposed to
# be one-dimensional
multiplied_covariates = (X* X.transpose()).sum(axis=0)
covariates_summed = combined.drop(TARGET_COLUMNS, axis=1).values.sum(axis=0)

def sum_covariates(covariates: np.array):
    return np.sum(covariates, axis=0)
    
def multiply_covariates(covariates: np.array):
    return (X* X.transpose()).sum(axis=0)

def local_update(covariates:np.array, z:np.array, gamma:np.array, rho,
                 multiplied_covariates, covariates_sum):
    
    first_component = 1/(rho * multiplied_covariates)
    
    pz = np.multiply(rho, z)
    
    second_component = np.multiply(pz - gamma, covariates) + covariates_sum    
    
    return first_component * second_component

def test_sum_covariates_returns_scalar():
    num_patients = 2
    num_features = 2
    
    covariates = np.arange(num_patients * num_features).reshape((num_patients, num_features))
    
    result = sum_covariates(covariates)
    assert np.isscalar(result)


def test_local_update():
    num_patients = 2
    num_features = 2
    
    rho=1
    covariates = np.arange(num_patients*num_features).reshape((num_patients, num_features))
    z = np.arange(num_patients)
    gamma = np.arange(num_patients)
    multiplied_cov = multiply_covariates(covariates)
    summed_cov = sum_covariates(covariates)
    
    updated = local_update(covariates, z, gamma, rho, multiplied_cov, summed_cov)
    
    assert updated.shape() == (1,)

test_local_update()
np.arange()

ValueError: operands could not be broadcast together with shapes (514,) (2,2) 

## Server update
- Server computes:
    - $\overline{\sigma}_n^{(p)} = \sum \limits_{k=1}^K \sigma_{nk}^{(p)}/K $
    - $\overline{\gamma}_{n}^{(p)} = \sum \limits_{k=1}^K \gamma_{nk}^{(p)}/K $
- Server computes $\overline{z}^{(p)}$ by applying Newton-Raphson to:
$ \sum_{t=1}^T \left[d_t log \sum \limits_{j \in R_t} exp(K \overline{z}_j) \right] + K \rho \sum \limits_{n=1}^N \left[ \frac{\overline{z}_n^2}{2} - 
\left( \overline{\sigma}_n^{(p)} + \frac{\overline{\gamma}_n^{(p-1)}}{\rho} \right) \overline{z}_n \right]    $

### Person-level auxiliary variables
For the update the server makes use of the auxiliary variables $\overline{\sigma}$ and $\overline{\gamma}$. The elements of these vectors have a one-on-one relationship with the patients.

Moreover, the server tries to find a variable $\overline{z}$ which not only has a one-on-one relationship with the patients, but also needs to be grouped based on patients' event times.

In [29]:
K = 1 #Number of institutions
dt = num_unique_times # Number of unique event times


def L_z(z: np.array, K: int, gamma_old:np.array, sigma, rho):
    
    Rt = group_samples_at_risk(z)
    
    component1 = L_z_component1(z, Rt)
    component2 = L_z_component2(z, K, sigma, gamma_old, rho)
    
    return component1 + component2
        
def L_z_component1(z, Rt):
    result = 0
    for t, group in z_samples_at_risk.items()
        result += dt * (K * np.exp(group)).sum()
    
    return result
        
def L_z_component2(z, K, sigma, gamma_old, rho):
    # TODO: check why we use gamma_old
    element_wise = np.square(z)/2 - sigma + (gamm_old/rho) * z
    return K * rho * element_wise.sum()
    

# Test if the output type is as expected (should be a scalar)

def test_lz_outputs_scalar():
    # Data with two samples
    columns = ['lenfol', 'z', 'gamma', 'sigma', 'rho']
    data = np.arange((2, len(columns)))
    
    samples = pd.DataFrame(data=data, columns=columns)
    
    
    

SyntaxError: invalid syntax (3783140394.py, line 5)

In [28]:

np.array([1,2,3]) * 5

array([ 5, 10, 15])

In [35]:
def (a:np.array):
    return a * a.transpose()



SyntaxError: invalid syntax (1779160323.py, line 1)

## Risks
### Differential privacy-ish
If the difference between $D_t$ and $D_{t+1}$, and similarly, the difference between $R_t$ and $R_{t+1}$ is too small, there is a great risk of data leakage. This needs to be addressed.

### "Gradient" leakage
The central server computes a variable $\boldsymbol{\overline{z}}$ which is a vector where every element corresponds to an individual patient.