In [20]:
import os

import numpy as np
import pandas as pd

from scipy import linalg

In [21]:
from server import Server
from client import Client

In [22]:
# ---- Participant-side settings (each site sets these locally) ----
data_dir = "test_data/raw_files"  # folder holding the per-cohort input files
cohorts = [
    "lab_A",
    "lab_C",
    "lab_D",
    "lab_E",
]

# ---- Coordinator-side settings ----
covariates = []  # covariates of the linear model
# "TMT": TMT data are expected and additional checks are performed;
# "DIA": DIA data are expected, without the additional checks.
experiment_type = "DIA"

In [23]:
# Registry for looking up Client objects by cohort name, and the
# coordinating server configured with the model covariates.
store_clients = dict()
server = Server(covariates)

In [24]:
# Clients join one by one: each cohort builds a Client from its intensity
# matrix and registers it with the server.
for cohort_name in cohorts:
    # Protein-group intensity matrix of this cohort.
    intensity_file_path = f"{data_dir}/{cohort_name}_protein_groups_matrix.tsv"

    # No design/annotation file is passed in this run; the path it would have is
    #   f"{data_dir}/{cohort_name}_design.tsv"
    client = Client(cohort_name, intensity_file_path, experiment_type)

    print(f"Client {client.cohort_name} is joining")
    print(f"Samples in {client.cohort_name} data: {len(client.sample_names)}")
    print(f"Protein groups in {client.cohort_name} data:  {len(client.prot_names)}")

    # Keep a handle on the client, then register it on the server side.
    store_clients[client.cohort_name] = client
    server.join_client(client)

# Summary of the server state once every cohort has joined.
print("===== Clients joined =====")
for label, value in (
    ("Client names:", server.client_names),
    ("Samples per client:", server.n_samples_per_cli),
    ("Covariates:", server.covariates),
    ("Stored protein group names:", len(server.stored_features)),
):
    print(label, value)

N = np.sum(server.n_samples_per_cli)  # total number of samples over all clients
print("Samples in total:", N)

# Further summaries that can be enabled when needed:
#   print("Target classes:", server.target_classes)
# For TMT data:
#   print("TMT-plexes per client:", server.n_tmt_per_cli)
#   Ntmt = np.sum(server.n_tmt_per_cli)
#   print("TMT-plexes in total:", Ntmt)

09-Aug-23 13:38:07 - root - INFO - Client lab_A: Log2(x+1) transformed intensities.
09-Aug-23 13:38:07 - root - INFO - Client lab_A: Loaded 28 samples and 2627 proteins.
09-Aug-23 13:38:07 - root - INFO - Server: joined client  lab_A
09-Aug-23 13:38:07 - root - INFO - Client lab_C: Log2(x+1) transformed intensities.
09-Aug-23 13:38:07 - root - INFO - Client lab_C: Loaded 23 samples and 2885 proteins.
09-Aug-23 13:38:07 - root - INFO - Server: joined client  lab_C
09-Aug-23 13:38:07 - root - INFO - Client lab_D: Log2(x+1) transformed intensities.
09-Aug-23 13:38:07 - root - INFO - Client lab_D: Loaded 24 samples and 2922 proteins.
09-Aug-23 13:38:07 - root - INFO - Server: joined client  lab_D
09-Aug-23 13:38:07 - root - INFO - Client lab_E: Log2(x+1) transformed intensities.
09-Aug-23 13:38:07 - root - INFO - Client lab_E: Loaded 24 samples and 2457 proteins.
09-Aug-23 13:38:07 - root - INFO - Server: joined client  lab_E


Client lab_A is joining
Samples in lab_A data: 28
Protein groups in lab_A data:  2627
Client lab_C is joining
Samples in lab_C data: 23
Protein groups in lab_C data:  2885
Client lab_D is joining
Samples in lab_D data: 24
Protein groups in lab_D data:  2922
Client lab_E is joining
Samples in lab_E data: 24
Protein groups in lab_E data:  2457
===== Clients joined =====
Client names: ['lab_A', 'lab_C', 'lab_D', 'lab_E']
Samples per client: [28, 23, 24, 24]
Covariates: []
Stored protein group names: 2336
Samples in total: 99


In [25]:
# Preview the intensity matrix of the last cohort that joined. The client is
# looked up explicitly instead of through the `client` variable left over from
# the joining loop, so this cell does not depend on leaked loop state.
store_clients[cohorts[-1]].intensities.head(5)

Unnamed: 0_level_0,CVT09_QC1_LabE_X023,CVT09_QC2_LabE_X024,CVT09_QC3_LabE_X025,CVT09_QC4_LabE_X026,CVT09_s3_X002,CVT09_s10_X003,CVT09_s13_X004,CVT09_s16_X005,CVT09_s25_X006,CVT09_s28_X007,...,CVT09_s55_X012,CVT09_s59_X013,CVT09_s61_X014,CVT09_s67_X015,CVT09_s73_X016,CVT09_s77_X017,CVT09_s84_X018,CVT09_s90_X019,CVT09_s93_X020,CVT09_s99_X021
rowname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P0ADA1,20.530068,20.800517,20.832956,20.472747,20.70833,20.826531,20.688345,20.876203,20.682322,20.802885,...,20.557822,20.648859,20.539442,20.485333,20.452319,20.535641,20.554284,20.505225,20.448845,20.492617
P0A8D6,21.469675,21.58328,21.719072,21.281608,21.63451,21.752018,21.591101,21.643421,21.662239,21.70394,...,21.381643,21.394004,21.47776,21.278157,21.377105,21.382044,21.525501,21.327597,21.343467,21.494742
P06993,19.617815,19.208911,19.283362,19.727261,19.235112,19.138732,19.347161,19.089872,19.556955,19.27265,...,19.523981,19.639554,19.378284,19.511807,19.544995,19.767097,19.658079,19.772695,19.686124,19.769605
P00959,23.359671,22.980434,22.918466,23.366544,23.050364,22.649941,23.035286,22.940258,22.888636,22.772447,...,23.355228,23.354017,23.340674,23.430777,23.408598,23.392416,23.378315,23.429526,23.360194,23.374592
P0AEQ1,18.931439,20.103046,20.028021,18.638796,20.060272,20.220696,20.047682,20.039499,19.983555,20.029693,...,18.903261,18.937029,19.125204,18.791735,18.919943,19.001499,18.846932,18.860851,18.774008,18.949787


In [26]:
# Input cleaning and design-matrix construction, one cohort at a time.
for cohort_name in cohorts:
    client = store_clients[cohort_name]
    client.validate_inputs(server.stored_features, server.variables)

    # Cohort-effect columns: one per cohort except the last, i.e. one column
    # fewer than the number of cohorts.
    client.create_design(server.client_names[:-1])

    print(f"Samples in {client.cohort_name} data: {len(client.sample_names)}")
    print(f"Protein groups in {client.cohort_name} data:  {len(client.prot_names)}")

# Register the intercept and the cohort columns as model variables on the
# server side; the list is printed before and after for comparison.
# NOTE(review): this overwrites server.variables instead of extending it
# (server.variables + [...]); the list is empty here, so the result is the
# same in this run -- confirm the intent before adding covariates.
print(server.variables)
server.variables = ["intercept", *server.client_names[:-1]]
print(server.variables)

09-Aug-23 13:38:07 - root - INFO - Client lab_A: Validated 28 samples and 2336 proteins.
09-Aug-23 13:38:07 - root - INFO - Client lab_C: Validated 23 samples and 2336 proteins.
09-Aug-23 13:38:07 - root - INFO - Client lab_D: Validated 24 samples and 2336 proteins.
09-Aug-23 13:38:07 - root - INFO - Client lab_E: Validated 24 samples and 2336 proteins.


Samples in lab_A data: 28
Protein groups in lab_A data:  2336
Samples in lab_C data: 23
Protein groups in lab_C data:  2336
Samples in lab_D data: 24
Protein groups in lab_D data:  2336
Samples in lab_E data: 24
Protein groups in lab_E data:  2336
[]
['intercept', 'lab_A', 'lab_C', 'lab_D']


In [27]:
# Exploratory check: every client name except the first one.
# NOTE(review): the design matrices above are built from client_names[:-1]
# (all but the LAST); this [1:] slice is not used by any later visible cell.
server.client_names[1:]

['lab_C', 'lab_D', 'lab_E']

In [28]:
# Inspect the design matrix built for lab_E. lab_E is the last cohort, so it
# has no column of its own (columns come from server.client_names[:-1]).
store_clients['lab_E'].design

Unnamed: 0,intercept,lab_A,lab_C,lab_D
CVT09_QC1_LabE_X023,1.0,-1,-1,-1
CVT09_QC2_LabE_X024,1.0,-1,-1,-1
CVT09_QC3_LabE_X025,1.0,-1,-1,-1
CVT09_QC4_LabE_X026,1.0,-1,-1,-1
CVT09_s3_X002,1.0,-1,-1,-1
CVT09_s10_X003,1.0,-1,-1,-1
CVT09_s13_X004,1.0,-1,-1,-1
CVT09_s16_X005,1.0,-1,-1,-1
CVT09_s25_X006,1.0,-1,-1,-1
CVT09_s28_X007,1.0,-1,-1,-1


In [29]:
# Shape of the last cohort's intensity matrix after validation. The client is
# looked up explicitly instead of through the `client` variable left over from
# the previous loop, so this cell does not depend on leaked loop state.
store_clients[cohorts[-1]].intensities.shape

(2336, 24)

In [30]:
# Restrict every cohort to the protein groups stored on the server
# (server.stored_features), in the server's order.
for cohort_name in cohorts:
    client = store_clients[cohort_name]
    client.prot_names = server.stored_features

    shared_rows = client.intensities.loc[client.prot_names, :]
    client.intensities = shared_rows
    # The counts table is not filtered here:
    #   client.counts = client.counts.loc[client.prot_names, :]

    print(f"Samples in {client.cohort_name} data: {len(client.sample_names)}, protein groups: {len(client.prot_names)}")

Samples in lab_A data: 28, protein groups: 2336
Samples in lab_C data: 23, protein groups: 2336
Samples in lab_D data: 24, protein groups: 2336
Samples in lab_E data: 24, protein groups: 2336


## 6) Limma – removeBatchEffect

In [31]:
### 1) compute XtX, XtY, beta and stdev
# Accumulators for the per-client XtX and XtY results, appended in cohort order.
XtX_list, XtY_list = [], []

# NOTE(review): this clears the per-client sample counts recorded while the
# clients joined; no visible cell fills the list again -- confirm that the
# server does not need it afterwards.
server.n_samples_per_cli = []

In [32]:
# Each client computes its local XtX and XtY, which are collected for the server.
for cohort_name in cohorts:
    client = store_clients[cohort_name]

    # Take the sample order from the design matrix index and align the
    # intensity matrix (rows: protein groups, columns: samples) to it.
    client.sample_names = client.design.index.values
    client.intensities = client.intensities.loc[client.prot_names, client.sample_names]
    client.n_samples = len(client.sample_names)

    XtX, XtY = client.compute_XtX_XtY()
    XtX_list += [XtX]
    XtY_list += [XtY]

    print(f"Client {client.cohort_name} has {client.n_samples} samples")
    print(XtX.shape, XtY.shape, sep="\n")

Client lab_A has 28 samples
(2336, 4, 4)
(2336, 4)
Client lab_C has 23 samples
(2336, 4, 4)
(2336, 4)
Client lab_D has 24 samples
(2336, 4, 4)
(2336, 4)
Client lab_E has 24 samples
(2336, 4, 4)
(2336, 4)


In [33]:
# Quick structural check of the collected XtX entries: a plain list reports
# (outer length, length of its first element); anything else reports only its
# outer length.
[
    (len(entry),) if not isinstance(entry, list) else (len(entry), len(entry[0]))
    for entry in XtX_list
]

[(2336,), (2336,), (2336,), (2336,)]

In [34]:
# Server-side step: hand the per-client XtX / XtY lists to the server, which
# computes the global beta and its stdev (per its log message). The
# coefficients are read back from server.beta in the following cells.
server.compute_beta_and_beta_stdev(XtX_list, XtY_list)

09-Aug-23 13:38:07 - root - INFO - Server: computing global beta and beta stdev, k = 4, n = 2336


## use beta (coefficients) to correct batch effects

In [36]:
# Display the fitted coefficients (a 2-D array with four columns here).
# NOTE(review): columns presumably follow server.variables
# (['intercept', 'lab_A', 'lab_C', 'lab_D']), one row per protein group -- confirm.
server.beta

array([[19.21481313,  1.36178262,  1.00308466,  0.16579205],
       [20.19725987,  3.41286279,  1.34762323, -1.01484329],
       [26.61740886,  1.5563936 ,  0.42833792, -0.4138365 ],
       ...,
       [19.6274467 ,  0.6382953 , -0.53304546, -1.20620506],
       [21.0365222 ,  3.11074624,  0.63488748, -0.07906553],
       [20.85901828,  1.06116665,  1.14985227, -0.68655987]])

In [37]:
# Batch correction on each client; in R notation: pg_matrix - beta %*% t(batch)
# Only the columns after the first are passed on.
# NOTE(review): column 0 is presumably the intercept -- confirm.
batch_beta = server.beta[:, 1:]

for cohort_name in cohorts:
    client = store_clients[cohort_name]
    client.remove_batch_effects(batch_beta)
    

09-Aug-23 13:38:07 - root - INFO - Client lab_A:	Batch effects removed.
09-Aug-23 13:38:07 - root - INFO - Client lab_C:	Batch effects removed.
09-Aug-23 13:38:07 - root - INFO - Client lab_D:	Batch effects removed.
09-Aug-23 13:38:07 - root - INFO - Client lab_E:	Batch effects removed.


In [38]:
# Write each cohort's batch-corrected intensity matrix to a tab-separated file.
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists first; otherwise a fresh checkout fails here.
output_dir = "results"
os.makedirs(output_dir, exist_ok=True)

for c in cohorts:
    client = store_clients[c]
    client.intensities_corrected.to_csv(f"{output_dir}/{c}_intensities_corrected.tsv", sep="\t")