# Normalize new marker counts using GC content and full-quantile (FQ-FQ) normalization

## Load required libraries

In [None]:
library(EDASeq)
library(Biostrings)
library(ggplot2)
library(plotly)

## Load new markers

In [None]:
# Raw counts table: the first CSV column supplies the row names, and
# check.names = FALSE leaves the column headers exactly as stored in the file.
extracted_df <- read.csv(
  file = "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/extracted_raw_counts_pairwise_marker_region.csv",
  check.names = FALSE,
  row.names = 1
)


In [None]:
# Preview the first rows of the raw counts table.
head(extracted_df, n = 6L)

## GC content in cell type marker regions

In [None]:
# Read the marker sequences from the FASTA file.
fasta <- readDNAStringSet(
  "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/new_pairwise_cell_types_markers.fa"
)

# Relative frequency of G and of C in every sequence (one column per letter).
gc_parts <- letterFrequency(fasta, letters = c("G", "C"), as.prob = TRUE)

# Add the two columns into a single GC fraction per sequence and label each
# value with the corresponding FASTA record name.
gc_content <- setNames(rowSums(gc_parts), names(fasta))

head(gc_content)


In [None]:
# Rebuild the labels from the FASTA record names: drop a leading ">" if one is
# present, then swap every ":" for "-" so the labels follow the same pattern
# as the row names of extracted_df.
region_ids <- sub("^>", "", names(fasta))
names(gc_content) <- gsub(":", "-", region_ids)

# Eyeball the first identifiers from both sources.
head(names(gc_content))
head(rownames(extracted_df))


## FQ-FQ normalization

In [None]:
# Build the EDASeq container holding the raw counts together with the
# per-region GC fraction used later as the within-lane covariate.
counts_mat <- as.matrix(extracted_df)

# Fail early, with a readable message, if the two inputs cannot be joined.
if (!is.numeric(counts_mat)) {
  stop("extracted_df must contain only numeric count columns", call. = FALSE)
}
if (is.null(rownames(counts_mat))) {
  stop("extracted_df has no row names to match against gc_content",
       call. = FALSE)
}
if (anyDuplicated(names(gc_content)) > 0) {
  stop("gc_content contains duplicated region names", call. = FALSE)
}
missing_gc <- setdiff(rownames(counts_mat), names(gc_content))
if (length(missing_gc) > 0) {
  stop(
    length(missing_gc), " region(s) in extracted_df have no GC value, e.g. ",
    paste(head(missing_gc, 3L), collapse = ", "),
    call. = FALSE
  )
}

# Look GC values up by region name so every row of the count matrix is paired
# with its own GC fraction, whatever order the FASTA records were in.
gc_aligned <- gc_content[rownames(counts_mat)]

dataSet <- newSeqExpressionSet(
  counts = counts_mat,
  featureData = data.frame(
    gc = unname(gc_aligned),
    row.names = rownames(counts_mat)
  )
)


In [None]:
# Step 1: full-quantile within-lane normalization against the "gc" feature
# column of dataSet.
dataWithin <- withinLaneNormalization(x = dataSet, y = "gc", which = "full")

# Step 2: full-quantile between-lane normalization of the GC-adjusted data.
dataNorm <- betweenLaneNormalization(x = dataWithin, which = "full")

# Extract the normalized count matrix. The result is stored under the same
# name as the accessor, so the accessor is called through its namespace.
normCounts <- EDASeq::normCounts(dataNorm)


In [None]:
# Preview the first rows of the normalized count matrix.
head(normCounts, n = 6L)

In [None]:
# Drop the final "_"-separated field from every column name.
# The names are always derived from dataNorm, not from the current column
# names of normCounts, so re-running this cell gives the same result instead
# of stripping one more field each time.
# NOTE(review): columns that differ only in the removed field end up sharing
# a name; confirm that is intended.
original_names <- colnames(EDASeq::normCounts(dataNorm))
colnames(normCounts) <- sub("_[^_]+$", "", original_names)
head(normCounts)


## Save normalized counts as CSV

In [None]:
# Write the normalized count matrix to disk as CSV, keeping the row names as
# the first column.
write.csv(
  x = normCounts,
  file = "/mnt/DATA3/daniel/project/04_DA_and_reference_building/data/normCounts_cell_type_pairwise_markers.csv",
  row.names = TRUE
)
