In [5]:
library(tidyverse)
source("../../evaluation_utils/preprocessing_MQ/preprocessing_report.R")

# MQ files

In [9]:
# REVIEWED
# Root directory of the raw MaxQuant reports; note the trailing slash, because
# paths below are built from it with paste0().
data_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/"

In [10]:
# Metadata

# Sample annotation table; it is stored in the raw MaxQuant reports directory,
# so the path is derived from data_path instead of repeating the absolute path.
metadata <- read.table(
  paste0(data_path, "Metadata_CosyBio.tsv"),
  header = TRUE, sep = "\t", stringsAsFactors = FALSE
)

# Replace spaces with dots in the quantitative column names.
metadata <- metadata %>%
  mutate(Quantitative.column.name = gsub(" ", ".", Quantitative.column.name, fixed = TRUE))

# Keep the full dotted names as row names before the column itself is shortened.
rownames(metadata) <- metadata$Quantitative.column.name

# Shorten the names: "Reporter.intensity.corrected.<k>.Pool<p>" -> "RIC_<k>.P_<p>".
# fixed = TRUE treats every "." as a literal dot; in a regular expression an
# unescaped "." would match any character.
metadata <- metadata %>%
  mutate(
    Quantitative.column.name = gsub("Reporter.intensity.corrected.", "RIC_", Quantitative.column.name, fixed = TRUE),
    Quantitative.column.name = gsub(".Pool", ".P_", Quantitative.column.name, fixed = TRUE)
  )

# Remove the outlier (the sample named RIC_3.P_3 at this point).
metadata <- metadata %>%
  filter(Quantitative.column.name != "RIC_3.P_3")

# Move the pool part to the front: RIC_<k>.P_<p> -> P_<p>.RIC_<k>
# (e.g. RIC_1.P_1 -> P_1.RIC_1). The dot is escaped so it only matches a dot.
metadata <- metadata %>%
  mutate(Quantitative.column.name = gsub("RIC_([0-9]+)\\.P_([0-9]+)", "P_\\2.RIC_\\1", Quantitative.column.name))

head(metadata, 3)
dim(metadata)

Unnamed: 0_level_0,Quantitative.column.name,Pool,Reporter.ion,Patient,Group,Center
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Reporter.intensity.corrected.1.Pool1,P_1.RIC_1,Pool1,126,Common Reference,Common Reference,Center1
Reporter.intensity.corrected.2.Pool1,P_1.RIC_2,Pool1,127N,heathy1,heathy,Center1
Reporter.intensity.corrected.3.Pool1,P_1.RIC_3,Pool1,127C,FSGS1,FSGS,Center1


In [11]:
# List of input files: one MaxQuant proteinGroups.txt per center
list_of_inputs <- list(
  "Center1" = paste0(data_path, "Center1/proteinGroups.txt"),
  "Center2" = paste0(data_path, "Center2/proteinGroups.txt"),
  "Center3" = paste0(data_path, "Center3/proteinGroups.txt")
)

# Pre-allocate one named slot per center instead of growing the list with c()
# inside the loop and naming it afterwards.
combined_pg_intensities <- vector("list", length(list_of_inputs))
names(combined_pg_intensities) <- names(list_of_inputs)

# Iterate over each center
for (center in names(list_of_inputs)) {
  # Metadata rows that belong to the current center
  center_metadata <- metadata %>%
    filter(Center == center)

  # Preprocess the MaxQuant output file for the current center; only the first
  # element of the returned list is kept.
  results_list <- preprocess_data_mxout(list_of_inputs[[center]], center_metadata, data_type = "protein")
  pg_intensities <- results_list[[1]]

  # `[<-` with a one-element list stores the value even if it is NULL, so the
  # result always has exactly one entry per center.
  combined_pg_intensities[center] <- list(pg_intensities)
}

Processed data count: 500 
Counts data count: 500 
Processed data count: 530 
Counts data count: 530 
Processed data count: 456 
Counts data count: 456 


In [19]:
# Output directory for the mapping / filtering check files (trailing slash required).
pg_output_path <- "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/mapping_filtering_check/"

# For every center: take the Majority.protein.IDs column of its preprocessed
# table, report the number of rows and write the column to a tab-separated file.
for (center in c("Center1", "Center2", "Center3")) {
  genes_intensities <- select(combined_pg_intensities[[center]], Majority.protein.IDs)
  cat("For", center, "center has", nrow(genes_intensities), "rows.\n")

  # The file name is the center name directly followed by the fixed suffix.
  out_file <- paste0(pg_output_path, center, "MajorID_MQ_notREV.tsv")
  write.table(genes_intensities, file = out_file, sep = "\t", quote = FALSE, row.names = FALSE)
}

For Center1 center has 500 rows.
For Center2 center has 530 rows.
For Center3 center has 456 rows.


# Custom mapping PG

In [21]:
# Custom-mapping intensity tables, keyed by center name
list_int <- list()

# Directory holding the <center>_intensities_counts_ALL.tsv files
custom_nr_dir <- '/home/yuliya/repos/cosybio/FedProt/data/TMT_data/mapping_filtering_check/custom_NR/'

for (center in c("Center1", "Center2", "Center3")) {
  intensities <- read.csv(
    paste0(custom_nr_dir, center, "_intensities_counts_ALL.tsv"),
    sep = "\t", header = TRUE, stringsAsFactors = FALSE
  )
  list_int[[center]] <- intensities
  cat("Intensities data frame for", center, "center has", nrow(intensities), "rows and", ncol(intensities), "columns.\n")

  # NOTE(review): an earlier comment here mentioned filtering out rows whose
  # values are 0 in all columns; no such filter is applied in this loop.

  # Unique values of the major_features column, written as a one-column TSV
  major <- unique(select(intensities, 'major_features'))
  cat("Center", center, "center has", nrow(major), "rows.\n")

  major_file <- paste0(pg_output_path, center, "Major_Custom_NotREV.tsv")
  write.table(major, file = major_file, sep = "\t", quote = FALSE, row.names = FALSE)
}

Center Center1 center has 571 rows.
Center Center2 center has 587 rows.
Center Center3 center has 506 rows.


In [22]:
# Display the table left in `intensities` by the loop above, i.e. the one read
# for the last center in that loop ("Center3").
intensities

major_features,Reporter.intensity.corrected.1.Pool4,Reporter.intensity.corrected.2.Pool4,Reporter.intensity.corrected.3.Pool4,Reporter.intensity.corrected.4.Pool4,Reporter.intensity.corrected.5.Pool4,Reporter.intensity.corrected.6.Pool4,Reporter.intensity.corrected.7.Pool4,Reporter.intensity.corrected.8.Pool4,Reporter.intensity.corrected.9.Pool4,⋯,Reporter.intensity.corrected.2.Pool6,Reporter.intensity.corrected.3.Pool6,Reporter.intensity.corrected.4.Pool6,Reporter.intensity.corrected.5.Pool6,Reporter.intensity.corrected.6.Pool6,Reporter.intensity.corrected.7.Pool6,Reporter.intensity.corrected.8.Pool6,Reporter.intensity.corrected.9.Pool6,Reporter.intensity.corrected.10.Pool6,Reporter.intensity.corrected.11.Pool6
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
A0A024R6N5;A0A0G2JRN3,91482.0,39615.0,43587.0,80451.0,133126.0,31849.1,88389.0,306645.0,131622.0,⋯,82668.0,75290.0,77846.0,84483.0,91495.0,103071.0,176439.0,1544250.0,92815.0,87082.0
A0A075B6H9,34847.0,87251.0,39394.0,40341.0,42727.0,51288.0,37853.0,72286.0,42068.0,⋯,40876.0,50691.0,58515.0,32968.0,38083.0,37138.0,35395.0,51815.0,52347.0,35346.0
A0A075B6I9;P04211,101487.0,169956.0,61382.0,83597.0,86418.0,121094.0,83259.0,119262.0,61073.0,⋯,83664.0,125190.0,65350.0,80610.0,75906.0,76086.0,87507.0,73270.0,109490.0,80759.0
A0A075B6K4,165435.0,135019.0,60692.0,118939.0,79944.0,137152.0,67529.0,103586.0,69481.0,⋯,103082.0,130410.0,122658.0,90457.0,76299.0,91018.0,114078.0,124654.0,197433.0,118255.0
A0A075B6K5,62151.0,79009.0,47033.0,40289.0,66940.0,58044.0,41581.0,69596.0,35667.0,⋯,10365.0,16831.0,13097.0,15632.0,5153.7,7377.6,13434.0,19887.0,8440.7,14365.0
A0A075B6P5;P01614;P01615;A0A087WW87,43519.0,93775.0,23943.0,36852.0,49822.0,56989.0,22098.0,56844.0,35024.0,⋯,47550.0,40188.0,39156.0,34402.0,27116.0,11973.0,24434.0,45933.0,57963.0,61653.0
A0A075B7B8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,⋯,132510.0,92115.0,123300.0,99677.0,56267.0,36548.0,114850.0,110080.0,119930.0,86249.0
A0A087WSY5;Q96IY4,1890677.6,2150562.8,1777006.2,2048077.4,1734344.8,1838722.6,1879146.1,1936531.1,2338963.2,⋯,3519982.0,2678659.0,3817684.0,3037822.0,2974400.0,2381316.0,3246423.0,2889761.0,2724804.0,2399720.0
A0A087WT59;P02766,2052243.3,2000358.3,1467285.5,2638505.0,2682104.0,1976855.6,2351462.2,1806391.3,2608162.0,⋯,4171629.0,3261998.0,3379509.0,2771405.0,2801421.0,2486453.1,3058748.0,6321644.0,3602942.0,3145210.0
A0A087X0M8;O00533,424923.8,509312.0,614248.6,451175.5,515825.4,456606.4,577095.4,405852.7,399683.3,⋯,1079225.6,882039.4,1001795.4,851708.0,705370.0,962615.3,748616.8,749934.9,769305.5,765654.5
