In [2]:
library(tidyverse)
library(cowplot)
library(VennDiagram)
library(gridExtra)

# Read files

In [3]:
# Load the sample metadata table (tab-separated, with a header row).
metadata <- read.table(
    "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/Metadata_CosyBio.tsv",
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE
)

# Replace spaces with dots in the quantitative column names and keep that dotted
# form as row names: pgnames_mxout() selects the report columns by these row names.
metadata$Quantitative.column.name <- gsub(" ", ".", metadata$Quantitative.column.name)
rownames(metadata) <- metadata$Quantitative.column.name

metadata <- metadata %>%
    # shorten the names: "Reporter.intensity.corrected." -> "RIC_", ".Pool" -> ".P_"
    mutate(
        Quantitative.column.name = gsub("Reporter.intensity.corrected.", "RIC_", Quantitative.column.name),
        Quantitative.column.name = gsub(".Pool", ".P_", Quantitative.column.name)
    ) %>%
    # remove the outlier
    filter(Quantitative.column.name != "RIC_3.P_3") %>%
    # put the pool first and the reporter second (e.g. RIC_1.P_1 -> P_1.RIC_1)
    mutate(
        Quantitative.column.name = gsub("RIC_([0-9]+).P_([0-9]+)", "P_\\2.RIC_\\1", Quantitative.column.name)
    )

head(metadata, 3)
dim(metadata)

Unnamed: 0_level_0,Quantitative.column.name,Pool,Reporter.ion,Patient,Group,Center
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
Reporter.intensity.corrected.1.Pool1,P_1.RIC_1,Pool1,126,Common Reference,Common Reference,Center1
Reporter.intensity.corrected.2.Pool1,P_1.RIC_2,Pool1,127N,heathy1,heathy,Center1
Reporter.intensity.corrected.3.Pool1,P_1.RIC_3,Pool1,127C,FSGS1,FSGS,Center1


In [4]:
# Read one MaxQuant proteinGroups report and extract intensities and peptide counts.
#
# Arguments:
#   path            - path to a tab-separated proteinGroups file with a header row.
#   center_metadata - metadata rows for one center. Its row names are the intensity
#                     column names to pick from the report; its
#                     `Quantitative.column.name` column holds the names those
#                     columns are renamed to (same order).
#
# Returns an unnamed list of two data frames:
#   [[1]] Fasta.headers plus one (renamed) intensity column per metadata row,
#   [[2]] Fasta.headers plus Razor...unique.peptides.
# Rows without a Fasta header use Majority.protein.IDs as the header instead.
#
# NOTE(review): read.table() keeps its default quote/comment characters here; if
# Fasta headers can contain ' or #, consider quote = "" and comment.char = "".
pgnames_mxout <- function(path, center_metadata) {
    pg_report <- read.table(
        path,
        header = TRUE,
        sep = "\t",
        stringsAsFactors = FALSE
    )

    intensity_columns <- rownames(center_metadata)
    required_columns <- c(
        "Majority.protein.IDs", "Fasta.headers",
        "Razor...unique.peptides", intensity_columns
    )
    missing_columns <- setdiff(required_columns, colnames(pg_report))
    if (length(missing_columns) > 0) {
        stop(
            "Columns missing from ", path, ": ",
            paste(missing_columns, collapse = ", "),
            call. = FALSE
        )
    }

    # Remove decoy matches, matches to contaminants and proteins that were only
    # identified by a modification site. `%in%` never yields NA, so a flag column
    # that is entirely blank (read as logical NA) or partly NA removes nothing
    # instead of turning every row into an all-NA row. Absent flag columns are skipped.
    for (flag_column in c("Reverse", "Potential.contaminant", "Only.identified.by.site")) {
        flags <- pg_report[[flag_column]]
        if (!is.null(flags)) {
            pg_report <- pg_report[!(flags %in% "+"), , drop = FALSE]
        }
    }

    # Fall back to the majority protein IDs where the Fasta header is blank or NA.
    headers <- pg_report$Fasta.headers
    without_header <- is.na(headers) | headers == ""
    headers[without_header] <- pg_report$Majority.protein.IDs[without_header]
    pg_report$Fasta.headers <- headers

    # take only the columns that we need and give them their short names
    pg_intensities <- pg_report[, c("Fasta.headers", intensity_columns), drop = FALSE]
    colnames(pg_intensities) <- c("Fasta.headers", center_metadata$Quantitative.column.name)

    # counts
    pg_counts <- pg_report[, c("Fasta.headers", "Razor...unique.peptides"), drop = FALSE]

    list(pg_intensities, pg_counts)
}

In [5]:
# Path of the MaxQuant proteinGroups report of each center, keyed by center name.
list_of_outputs <- list(
    "Center1" = "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/proteinGroups_center1.txt",
    "Center2" = "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/proteinGroups_center2.txt",
    "Center3" = "/home/yuliya/repos/cosybio/FedProt/data/TMT_data/raw_MaxQuant_reports/proteinGroups_center3.txt"
)

# One slot per center, named up front so every result is stored under its own
# center name rather than relying on insertion order.
combined_pg_intensities <- setNames(
    vector("list", length(list_of_outputs)),
    names(list_of_outputs)
)
combined_counts_intensities <- combined_pg_intensities

for (center in names(list_of_outputs)) {

    center_metadata <- metadata %>% filter(Center == center)
    if (nrow(center_metadata) == 0) {
        # without metadata rows no intensity column would be selected
        stop("No metadata rows found for ", center, call. = FALSE)
    }

    # pgnames_mxout() returns list(intensities, counts)
    results_list <- pgnames_mxout(list_of_outputs[[center]], center_metadata)

    combined_pg_intensities[[center]] <- results_list[[1]]
    combined_counts_intensities[[center]] <- results_list[[2]]
}

# head(combined_pg_intensities)
colnames(combined_pg_intensities[[1]])

### Sum the rows that share the same Fasta headers

In [6]:
# Turn each ";"-separated Fasta header string into its unique, non-empty entries,
# re-joined with ";". vapply() always returns a character vector of the input
# length, whereas sapply() would return a matrix when every header happens to
# split into the same number of entries.
clean_fasta_headers <- function(headers) {
    vapply(
        strsplit(as.character(headers), ";", fixed = TRUE),
        function(parts) {
            parts <- unique(parts)
            paste(parts[parts != ""], collapse = ";")
        },
        character(1),
        USE.NAMES = FALSE
    )
}

# Clean the Fasta headers of one table, then sum all other columns over the rows
# that end up with the same header (one output row per distinct header).
sum_rows_by_fasta_header <- function(pg) {
    pg$Fasta.headers <- clean_fasta_headers(pg$Fasta.headers)
    pg %>%
        group_by(Fasta.headers) %>%
        summarise(across(everything(), sum), .groups = "drop")
}

# intensities: one summed table per center
combined_pg_intensities_filteres <- lapply(
    combined_pg_intensities,
    sum_rows_by_fasta_header
)

# same for counts
combined_counts_intensities_filteres <- lapply(
    combined_counts_intensities,
    sum_rows_by_fasta_header
)

### Filter out rows that contain a 0 in any of the "Common Reference" columns

In [7]:
# Drop every row that has a 0 in at least one "Common Reference" column.
# Which columns are reference columns is looked up in the metadata, per center.
# The table dimensions are printed before and after filtering.

for (center in names(list_of_outputs)) {
    print(center)

    pg <- combined_pg_intensities_filteres[[center]]
    print(dim(pg))

    # short column names of this center's common-reference channels
    ref_colnames <- metadata %>%
        filter(Center == center, Patient == "Common Reference") %>%
        pull(Quantitative.column.name)

    # keep a row only when none of its reference intensities equals 0
    pg <- pg %>%
        filter(rowSums(.[, ref_colnames] == 0) == 0)
    print(dim(pg))

    combined_pg_intensities_filteres[[center]] <- pg
}



[1] "Center1"
[1] 472  23
[1] 439  23
[1] "Center2"
[1] 500  22
[1] 439  22
[1] "Center3"
[1] 431  23
[1] 373  23


### join the reports

In [102]:

# all_pg_intensities <- data.frame()

# for(center in names(list_of_outputs)) {
#     pg_intensities <- combined_pg_intensities_filteres[[center]]
#     if(!(nrow(all_pg_intensities) == 0)) {
#         all_pg_intensities <- full_join(all_pg_intensities, pg_intensities, by = c("Fasta.headers"))
#     } else {
#         all_pg_intensities <- pg_intensities
#     }
# }

# dim(all_pg_intensities)

In [8]:

# Keep only the protein groups present in every center: inner-join the per-center
# intensity tables on Fasta.headers, in the order of list_of_outputs.
# Reduce() folds the joins without using "zero rows" as a marker for the first
# iteration, so an empty intermediate intersection stays empty instead of being
# replaced by the next center's full table.
all_pg_intensities <- Reduce(
    function(joined, center_table) {
        inner_join(joined, center_table, by = "Fasta.headers")
    },
    combined_pg_intensities_filteres[names(list_of_outputs)]
)

dim(all_pg_intensities)

In [9]:

# Inner-join the per-center peptide-count tables on Fasta.headers, in the order of
# list_of_outputs. Reduce() folds the joins without using "zero rows" as a marker
# for the first iteration, so an empty intermediate intersection stays empty
# instead of being replaced by the next center's full table.
all_pg_counts <- Reduce(
    function(joined, center_table) {
        inner_join(joined, center_table, by = "Fasta.headers")
    },
    combined_counts_intensities_filteres[names(list_of_outputs)]
)

# For each Fasta.headers row take the minimum count over the centers. After the
# joins, each center's count column still starts with "Razor" (the join only
# appends suffixes to clashing names).
all_pg_counts <- all_pg_counts %>%
    rowwise() %>%
    mutate(min_count = min(c_across(starts_with("Razor")), na.rm = TRUE)) %>%
    ungroup() %>%
    select(Fasta.headers, min_count)

dim(all_pg_counts)