In [21]:
from pathlib import Path

import numpy as np
import pandas as pd
from scipy import linalg

In [22]:
from server import Server
from client import Client

In [23]:
# Dataset variant to process; it is used as the suffix of the input data folder.
MODE = 'Balanced'
# Alternative: MODE = 'Imalanced'
# NOTE(review): the spelling must match the data folder suffix on disk
# (possibly meant to be 'Imbalanced') - confirm before switching.

# Name of the results sub-folder for this run (the "no covariates" variant).
MODE_TWO = f"{MODE}_nocov"
# Alternative: MODE_TWO = MODE

In [24]:
# --- Settings defined by each participant ---
# Path to the data folder for the selected MODE.
data_dir = f"./test_data/raw_files_first_{MODE}"

# Cohorts taking part in the run (one client is created per entry).
cohorts = [
    "lab_A",
    "lab_B",
    "lab_C",
    "lab_D",
    "lab_E",
]

# --- Settings defined by the coordinator ---
# Covariates in the linear model; empty for this run (e.g. ['Pyr'] to include one).
# Use only one covariate if there are two conditions.
covariates = []
# "TMT": TMT data are expected and additional checks are done;
# "DIA": DIA data are expected without additional checks.
experiment_type = "DIA"

In [25]:
# Registry of the joined Client objects, keyed by cohort name.
store_clients = dict()
# Initialize the server with the coordinator-defined covariates.
server = Server(covariates)

In [26]:
# Clients are joining: create one Client per cohort and register it with the server.
for cohort_name in cohorts:
    cohort_prefix = f"{data_dir}/{cohort_name}"
    # matrix of intensities
    intensity_file_path = cohort_prefix + "_protein_groups_matrix.tsv"
    # design matrix
    annotation_file_path = cohort_prefix + "_design.tsv"

    client = Client(cohort_name, intensity_file_path, annotation_file_path, experiment_type)

    # Report what the freshly created client holds.
    print(
        f"Client {client.cohort_name} is joining",
        f"Samples in {client.cohort_name} data: {len(client.sample_names)}",
        f"Protein groups in {client.cohort_name} data:  {len(client.prot_names)}",
        sep="\n",
    )

    # Keep a handle to the client under its cohort name, then join it to the server.
    store_clients[client.cohort_name] = client
    server.join_client(client)

# Summary of the server state once every client has joined.
print("===== Clients joined =====")
for label, value in (
    ("Client names:", server.client_names),
    ("Samples per client:", server.n_samples_per_cli),
    ("Covariates:", server.covariates),
    ("Stored protein group names:", len(server.stored_features)),
):
    print(label, value)

N = np.sum(server.n_samples_per_cli)  # total number of samples
print("Samples in total:", N)


# TMT prints:
# print("TMT-plexes per client:", server.n_tmt_per_cli)
# Ntmt = np.sum(server.n_tmt_per_cli)
# print("TMT-plexes in total:", Ntmt)

11-Jan-24 19:16:52 - root - INFO - Client lab_A: Log2(x+1) transformed intensities.
11-Jan-24 19:16:52 - root - INFO - Client lab_A: Loaded 24 samples and 2568 proteins.
11-Jan-24 19:16:52 - root - INFO - Server: joined client  lab_A
11-Jan-24 19:16:52 - root - INFO - Client lab_B: Log2(x+1) transformed intensities.
11-Jan-24 19:16:52 - root - INFO - Client lab_B: Loaded 13 samples and 2806 proteins.
11-Jan-24 19:16:52 - root - INFO - Server: joined client  lab_B
11-Jan-24 19:16:52 - root - INFO - Client lab_C: Log2(x+1) transformed intensities.
11-Jan-24 19:16:52 - root - INFO - Client lab_C: Loaded 14 samples and 2790 proteins.
11-Jan-24 19:16:52 - root - INFO - Server: joined client  lab_C
11-Jan-24 19:16:52 - root - INFO - Client lab_D: Log2(x+1) transformed intensities.
11-Jan-24 19:16:52 - root - INFO - Client lab_D: Loaded 15 samples and 2824 proteins.
11-Jan-24 19:16:52 - root - INFO - Server: joined client  lab_D
11-Jan-24 19:16:52 - root - INFO - Client lab_E: Log2(x+1) trans

Client lab_A is joining
Samples in lab_A data: 24
Protein groups in lab_A data:  2568
Client lab_B is joining
Samples in lab_B data: 13
Protein groups in lab_B data:  2806
Client lab_C is joining
Samples in lab_C data: 14
Protein groups in lab_C data:  2790
Client lab_D is joining
Samples in lab_D data: 15
Protein groups in lab_D data:  2824
Client lab_E is joining
Samples in lab_E data: 15
Protein groups in lab_E data:  2423
===== Clients joined =====
Client names: ['lab_A', 'lab_B', 'lab_C', 'lab_D', 'lab_E']
Samples per client: [24, 13, 14, 15, 15]
Covariates: []
Stored protein group names: 2300
Samples in total: 81


In [27]:
# # Exclude P31068 from the list server.stored_features
# server.stored_features.remove("P31068")


In [28]:
# Preview the first rows of the intensities of the last cohort in `cohorts`.
# The client is looked up explicitly instead of relying on the `client`
# variable left over from a loop in another cell (hidden notebook state).
store_clients[cohorts[-1]].intensities.head(5)

Unnamed: 0_level_0,CVT09_QC3_LabE_X025,CVT09_s3_X002,CVT09_s10_X003,CVT09_s13_X004,CVT09_s16_X005,CVT09_s25_X006,CVT09_s28_X007,CVT09_s31_X008,CVT09_s36_X009,CVT09_s41_X010,CVT09_s59_X013,CVT09_s61_X014,CVT09_s73_X016,CVT09_s84_X018,CVT09_s93_X020
rowname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
P0ADA1,20.832956,20.70833,20.826531,20.688345,20.876203,20.682322,20.802885,20.797734,20.613864,20.857994,20.648859,20.539442,20.452319,20.554284,20.448845
P0A8D6,21.719072,21.63451,21.752018,21.591101,21.643421,21.662239,21.70394,21.745151,21.450477,21.476415,21.394004,21.47776,21.377105,21.525501,21.343467
P06993,19.283362,19.235112,19.138732,19.347161,19.089872,19.556955,19.27265,19.220116,19.235278,18.965302,19.639554,19.378284,19.544995,19.658079,19.686124
P00959,22.918466,23.050364,22.649941,23.035286,22.940258,22.888636,22.772447,22.762872,23.149398,23.092034,23.354017,23.340674,23.408598,23.378315,23.360194
P0AEQ1,20.028021,20.060272,20.220696,20.047682,20.039499,19.983555,20.029693,19.972,19.829458,19.811508,18.937029,19.125204,18.919943,18.846932,18.774008


In [29]:
# Cleaning inputs: validate every client's data against the server's stored
# features and variables, then build its design matrix.
for c in cohorts:
    client = store_clients[c]
    client.validate_inputs(server.stored_features, server.variables)
    # Cohort-effect columns are added to each design matrix: one per client
    # except the last, i.e. one column fewer than the number of cohorts.
    client.create_design(server.client_names[:-1])

    print(
        f"Samples in {client.cohort_name} data: {len(client.sample_names)}",
        f"Protein groups in {client.cohort_name} data:  {len(client.prot_names)}",
        sep="\n",
    )

# Server side: prepend the intercept and the cohort columns to the variables,
# printing the list before and after the change.
print(server.variables)
server.variables = ['intercept', *server.client_names[:-1], *server.variables]
print(server.variables)

11-Jan-24 19:16:52 - root - INFO - Client lab_A: Validated 24 samples and 2300 proteins.
11-Jan-24 19:16:52 - root - INFO - Client lab_A: Design matrix created.
11-Jan-24 19:16:52 - root - INFO - Client lab_A: Design matrix columns: ['intercept' 'lab_A' 'lab_B' 'lab_C' 'lab_D']
11-Jan-24 19:16:52 - root - INFO - Client lab_B: Validated 13 samples and 2300 proteins.
11-Jan-24 19:16:52 - root - INFO - Client lab_B: Design matrix created.
11-Jan-24 19:16:52 - root - INFO - Client lab_B: Design matrix columns: ['intercept' 'lab_A' 'lab_B' 'lab_C' 'lab_D']
11-Jan-24 19:16:52 - root - INFO - Client lab_C: Validated 14 samples and 2300 proteins.
11-Jan-24 19:16:52 - root - INFO - Client lab_C: Design matrix created.
11-Jan-24 19:16:52 - root - INFO - Client lab_C: Design matrix columns: ['intercept' 'lab_A' 'lab_B' 'lab_C' 'lab_D']
11-Jan-24 19:16:52 - root - INFO - Client lab_D: Validated 15 samples and 2300 proteins.
11-Jan-24 19:16:52 - root - INFO - Client lab_D: Design matrix created.
11

Samples in lab_A data: 24
Protein groups in lab_A data:  2300
Samples in lab_B data: 13
Protein groups in lab_B data:  2300
Samples in lab_C data: 14
Protein groups in lab_C data:  2300
Samples in lab_D data: 15
Protein groups in lab_D data:  2300
Samples in lab_E data: 15
Protein groups in lab_E data:  2300
[]
['intercept', 'lab_A', 'lab_B', 'lab_C', 'lab_D']


In [30]:
# Display the server's client names with the first entry dropped.
server.client_names[1:]

['lab_B', 'lab_C', 'lab_D', 'lab_E']

In [31]:
# Inspect the design matrix of lab_E, the one cohort that was not passed to
# create_design as a cohort column (client_names[:-1] leaves it out).
store_clients['lab_E'].design

Unnamed: 0_level_0,intercept,lab_A,lab_B,lab_C,lab_D
rowname,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CVT09_QC3_LabE_X025,1.0,-1,-1,-1,-1
CVT09_s3_X002,1.0,-1,-1,-1,-1
CVT09_s10_X003,1.0,-1,-1,-1,-1
CVT09_s13_X004,1.0,-1,-1,-1,-1
CVT09_s16_X005,1.0,-1,-1,-1,-1
CVT09_s25_X006,1.0,-1,-1,-1,-1
CVT09_s28_X007,1.0,-1,-1,-1,-1
CVT09_s31_X008,1.0,-1,-1,-1,-1
CVT09_s36_X009,1.0,-1,-1,-1,-1
CVT09_s41_X010,1.0,-1,-1,-1,-1


In [32]:
# Shape of the intensity matrix of the last cohort in `cohorts`.
# The client is looked up explicitly instead of relying on the `client`
# variable left over from a loop in another cell (hidden notebook state).
store_clients[cohorts[-1]].intensities.shape

(2300, 15)

In [33]:
# Keeping only shared proteins: every client adopts the server's stored
# features as its protein list and subsets its intensities to those rows.
for c in cohorts:
    client = store_clients[c]
    client.prot_names = server.stored_features
    shared_rows = client.intensities.loc[client.prot_names, :]
    client.intensities = shared_rows
    # (disabled) the same subsetting for counts:
    # client.counts = client.counts.loc[client.prot_names, :]
    summary_line = f"Samples in {client.cohort_name} data: {len(client.sample_names)}, protein groups: {len(client.prot_names)}"
    print(summary_line)

Samples in lab_A data: 24, protein groups: 2300
Samples in lab_B data: 13, protein groups: 2300
Samples in lab_C data: 14, protein groups: 2300
Samples in lab_D data: 15, protein groups: 2300
Samples in lab_E data: 15, protein groups: 2300


## 6) Limma - removeBatchEffect

In [34]:
# Step 1: containers for the per-client XtX and XtY results, from which beta
# and its stdev are computed. Re-run this cell before re-running the
# collection loop, otherwise the lists accumulate duplicate entries.
XtX_list, XtY_list = [], []

# NOTE(review): this empties the server's per-client sample counts and no
# visible cell fills them again - confirm the reset is intended.
server.n_samples_per_cli = []

In [35]:
# Collect XtX and XtY from every client.
for c in cohorts:
    client = store_clients[c]

    # Take the sample order from the design matrix index, then reorder the
    # intensities by proteins (rows) and by those sample names (columns).
    client.sample_names = client.design.index.values
    client.intensities = client.intensities.loc[client.prot_names, client.sample_names]
    client.n_samples = len(client.sample_names)

    XtX, XtY = client.compute_XtX_XtY()
    XtX_list += [XtX]
    XtY_list += [XtY]

    print(
        f"Client {client.cohort_name} has {client.n_samples} samples",
        XtX.shape,
        XtY.shape,
        sep="\n",
    )

Client lab_A has 24 samples
(2300, 5, 5)
(2300, 5)
Client lab_B has 13 samples
(2300, 5, 5)
(2300, 5)
Client lab_C has 14 samples
(2300, 5, 5)
(2300, 5)
Client lab_D has 15 samples
(2300, 5, 5)
(2300, 5)
Client lab_E has 15 samples
(2300, 5, 5)
(2300, 5)


In [36]:
# Size check of the collected XtX entries: (rows, cols) for a nested Python
# list, (rows,) for any other container.
[
    (len(entry),) if not isinstance(entry, list) else (len(entry), len(entry[0]))
    for entry in XtX_list
]

[(2300,), (2300,), (2300,), (2300,), (2300,)]

In [37]:
# Hand the collected per-client XtX and XtY lists to the server, which logs
# that it computes the global beta and beta stdev from them.
server.compute_beta_and_beta_stdev(XtX_list, XtY_list)

11-Jan-24 19:16:53 - root - INFO - Server: computing global beta and beta stdev, k = 5, n = 2300


## use beta (coefficients) to correct batch effects

In [38]:
# Display the coefficients (beta) stored on the server.
server.beta

array([[17.85603314,  2.74016121, -5.89320207,  2.84647022,  1.43492513],
       [19.02495288,  4.58768995, -5.10249833,  2.86902293,  0.06741085],
       [25.32093529,  2.85266558, -5.10174982,  1.64930341,  0.91632211],
       ...,
       [18.48808868,  1.75966314, -4.91697978,  1.10717321, -0.34642961],
       [20.42682665,  3.7185432 , -2.58735193,  1.46178296,  0.41259323],
       [19.79164679,  2.09787317, -4.17860247,  2.19175616,  0.38571412]])

In [39]:
# Every client removes its batch effects using the server's coefficients.
# Original note on the correction (R notation): pg_matrix - beta %*% t(batch)
for client in (store_clients[c] for c in cohorts):
    client.remove_batch_effects(server.beta)
    

11-Jan-24 19:16:53 - root - INFO - Client lab_A:	Batch effects removed.
11-Jan-24 19:16:53 - root - INFO - Client lab_B:	Batch effects removed.
11-Jan-24 19:16:53 - root - INFO - Client lab_C:	Batch effects removed.
11-Jan-24 19:16:53 - root - INFO - Client lab_D:	Batch effects removed.
11-Jan-24 19:16:53 - root - INFO - Client lab_E:	Batch effects removed.


In [40]:
# Write each cohort's corrected intensities to results/<MODE_TWO>/ as a
# tab-separated file.
results_dir = Path('results') / MODE_TWO
# to_csv does not create missing folders, so make sure the target exists first.
results_dir.mkdir(parents=True, exist_ok=True)

for c in cohorts:
    client = store_clients[c]
    client.intensities_corrected.to_csv(results_dir / f'{c}_intensities_corrected.tsv', sep='\t')