In [2]:
# Load libraries 
library(BSgenome)
library(Biostrings)   
library(GenomeInfoDb)
library(rtracklayer)
library(GenomicFeatures)
#library(AnnotationDbi)
library(ArchR)
library(AnnotationForge) #"Only need for making annotation"
library(OrganismDbi)
library(ensembldb)
library(dplyr)
library(tidyverse)
library(Seurat)
library(SingleCellExperiment)
library( zellkonverter )
library( anndata )
library(BSgenome.Schisto.wormbase.WBPS15)# This bsgenome has all chromosomes 
library(cicero)
library(ChIPseeker)
library(monocle3)

Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, append, as.data.frame, basename, cbind, colnames,
    dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep,
    grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
    order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank,
    rbind, Reduce, rownames, sapply, setdiff, sort, table, tapply,
    union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

Loading required package: stats4


Attaching package: ‘S4Vectors’


The following objects are masked from ‘package:base’:

    expand.grid, I, unname


Loading required package: IRanges

Loading required package: GenomeInfoDb

Loading required package: GenomicRanges

Loading required package: Biostrings

Loading required package: XVector


Attachin

In [3]:
# Load the saved final ArchR project for each dataset (Smed, Sman, Mlig).
# force = FALSE and showLogo = FALSE are passed through unchanged.
Smed <- loadArchRProject(
  path = "ArchROutputs/Smed/ArchRProjs/Smed_final_ArchR_proj/",
  force = FALSE,
  showLogo = FALSE
)
Sman <- loadArchRProject(
  path = "ArchROutputs/Sman/ArchRProjs/Sman_final_ArchR_proj/",
  force = FALSE,
  showLogo = FALSE
)
Mlig <- loadArchRProject(
  path = "ArchROutputs/Mlig/ArchRProjs/Mlig_final_ArchR_proj/",
  force = FALSE,
  showLogo = FALSE
)

Successfully loaded ArchRProject!

Successfully loaded ArchRProject!

Successfully loaded ArchRProject!



## Smed peak annotation 

In [8]:
# Reorder the Smed peak set so that peaks are grouped by sequence name, with
# the sequence names taken in sorted order, then label the peaks Peak1..PeakN.
Smed_peaks <- getPeakSet(Smed)
Smed_chr_order <- sort(seqlevels(Smed_peaks))

# Rank every peak by the position of its sequence name in the sorted order.
# order() leaves ties in their original order, so peaks on the same sequence
# keep the relative order they had in the ArchR peak set. A single subset
# replaces the per-sequence list that was previously concatenated pairwise.
Smed_chr_rank <- match(as.character(seqnames(Smed_peaks)), Smed_chr_order)
Smed_reordered_features <- Smed_peaks[order(Smed_chr_rank)]

# seq_along() is safe for an empty peak set, unlike 1:length(x).
names(Smed_reordered_features) <- sprintf(
  "Peak%d",
  seq_along(Smed_reordered_features)
)

In [9]:
# Print the reordered, renamed Smed peak set for a visual check.
Smed_reordered_features

GRanges object with 317488 ranges and 4 metadata columns:
                  seqnames          ranges strand |     score       idx
                     <Rle>       <IRanges>  <Rle> | <integer> <integer>
       Peak1  dd_Smes_g4_1       7590-7935      * |        75         1
       Peak2  dd_Smes_g4_1       8523-9081      * |        75         2
       Peak3  dd_Smes_g4_1       9261-9542      * |       140         3
       Peak4  dd_Smes_g4_1     11180-11334      * |        77         4
       Peak5  dd_Smes_g4_1     11397-11701      * |       229         5
         ...           ...             ...    ... .       ...       ...
  Peak317484 dd_Smes_g4_99 2442858-2443360      * |        84      1057
  Peak317485 dd_Smes_g4_99 2444200-2444348      * |        70      1058
  Peak317486 dd_Smes_g4_99 2446643-2446791      * |        69      1059
  Peak317487 dd_Smes_g4_99 2448389-2448551      * |        81      1060
  Peak317488 dd_Smes_g4_99 2448733-2449276      * |       422      1061
      

In [10]:
# Build a TxDb from the Smed gene-model GTF. It is used to annotate each peak
# with its genomic feature type so the annotation can later be attached to
# the per-peak metadata.
Smed_txdb <- makeTxDbFromGFF(
  file = "./Refs/dd_Smes_g4/Smes_SMESG_g4_hc_edit_v04.gtf"
)
# Unused leftover path, kept for reference:
#peak_file <- "/media/gary/Chew/Schisto_ATAC/schisto_all/cc_ENCODE_peaks_CT/itoverlap_overlap_all.narrowPeak"

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK



In [11]:
# Annotate every Smed peak against the Smed TxDb with ChIPseeker.
# tssRegion sets the window around the TSS passed to annotatePeak();
# genomicAnnotationPriority is the order used to pick one label when a peak
# matches several feature types.
Smed_peakAnno <- annotatePeak(
  Smed_reordered_features,
  TxDb = Smed_txdb,
  tssRegion = c(-1000, 1000),
  genomicAnnotationPriority = c(
    "Promoter", "Exon", "Intron", "3UTR", "5UTR", "Downstream", "Intergenic"
  ),
  overlap = "all"
)

>> preparing features information...		 2023-06-12 03:34:05 PM 
>> identifying nearest features...		 2023-06-12 03:34:05 PM 
>> calculating distance from peak to TSS...	 2023-06-12 03:34:11 PM 
>> assigning genomic annotation...		 2023-06-12 03:34:11 PM 
>> assigning chromosome lengths			 2023-06-12 03:34:20 PM 
>> done...					 2023-06-12 03:34:20 PM 


In [12]:
# Flatten the annotated Smed peaks (the `anno` slot of the result) into a
# data frame and save it as CSV.
# NOTE(review): write_csv() does not write row names, so the "PeakN" names
# are presumably absent from the file -- confirm that is intended.
Smed_peak_annot <- as.data.frame(slot(Smed_peakAnno, "anno"))
write_csv(
  Smed_peak_annot,
  "./Metadata/Smed.chipseekr_annots.csv"
)

## Sman peak annotation 

In [9]:
# Reorder the Sman peak set so that peaks are grouped by sequence name, with
# the sequence names taken in sorted order, then label the peaks Peak1..PeakN.
Sman_peaks <- getPeakSet(Sman)
Sman_chr_order <- sort(seqlevels(Sman_peaks))

# Rank every peak by the position of its sequence name in the sorted order.
# order() leaves ties in their original order, so peaks on the same sequence
# keep the relative order they had in the ArchR peak set. A single subset
# replaces the per-sequence list that was previously concatenated pairwise.
Sman_chr_rank <- match(as.character(seqnames(Sman_peaks)), Sman_chr_order)
Sman_reordered_features <- Sman_peaks[order(Sman_chr_rank)]

# seq_along() is safe for an empty peak set, unlike 1:length(x).
names(Sman_reordered_features) <- sprintf(
  "Peak%d",
  seq_along(Sman_reordered_features)
)

In [10]:
# Print the reordered, renamed Sman peak set for a visual check.
Sman_reordered_features

GRanges object with 162349 ranges and 4 metadata columns:
                 seqnames        ranges strand |     score       idx        GC
                    <Rle>     <IRanges>  <Rle> | <integer> <integer> <numeric>
       Peak1      SM_V7_1   32709-32858      * |        74         1    0.3267
       Peak2      SM_V7_1   54285-54746      * |        75         2    0.3247
       Peak3      SM_V7_1   90095-90244      * |       101         3    0.4600
       Peak4      SM_V7_1  99821-100158      * |       237         4    0.2870
       Peak5      SM_V7_1 126020-126404      * |       264         5    0.3247
         ...          ...           ...    ... .       ...       ...       ...
  Peak162345 SM_V7_ZWU006   21874-22186      * |       168        13    0.2843
  Peak162346 SM_V7_ZWU006   23144-23374      * |        90        14    0.2727
  Peak162347 SM_V7_ZWU006   25587-25808      * |       108        15    0.3649
  Peak162348 SM_V7_ZWU006   27474-27762      * |       115        16    0

In [11]:
# Build a TxDb from the Sman gene-model GTF. It is used to annotate each peak
# with its genomic feature type so the annotation can later be attached to
# the per-peak metadata.
# NOTE(review): this GTF path is absolute ("/Refs/..."), while other paths in
# this notebook are relative (e.g. "./Metadata/") -- confirm it is intended.
schisto_txdb <- makeTxDbFromGFF(
  file = "/Refs/SM_V7/schistosoma_mansoni.PRJEA36577.WBPS15.canonical_geneset_converted.gtf"
)
# Unused leftover path, kept for reference:
#peak_file <- "/media/gary/Chew/Schisto_ATAC/schisto_all/cc_ENCODE_peaks_CT/itoverlap_overlap_all.narrowPeak"

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
"The "phase" metadata column contains non-NA values for features of type
  stop_codon. This information was ignored."
OK



In [12]:
# Annotate every Sman peak against the Sman TxDb with ChIPseeker.
# tssRegion sets the window around the TSS passed to annotatePeak();
# genomicAnnotationPriority is the order used to pick one label when a peak
# matches several feature types.
Sman_peakAnno <- annotatePeak(
  Sman_reordered_features,
  TxDb = schisto_txdb,
  tssRegion = c(-1000, 1000),
  genomicAnnotationPriority = c(
    "Promoter", "Exon", "Intron", "3UTR", "5UTR", "Downstream", "Intergenic"
  ),
  overlap = "all"
)

>> preparing features information...		 2023-06-12 03:10:45 PM 
>> identifying nearest features...		 2023-06-12 03:10:45 PM 


"Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': SM_V7_1001, SM_V7_1H005, SM_V7_1H008, SM_V7_1H012, SM_V7_1H013, SM_V7_1H021, SM_V7_1H022, SM_V7_1H023, SM_V7_1H025, SM_V7_1H026, SM_V7_1H028, SM_V7_1H031, SM_V7_1H032, SM_V7_1H033, SM_V7_1H034, SM_V7_1H035, SM_V7_1H037, SM_V7_1H038, SM_V7_1H041, SM_V7_1H042, SM_V7_1H044, SM_V7_2H013, SM_V7_2H015, SM_V7_2H016, SM_V7_2H017, SM_V7_2H020, SM_V7_2H022, SM_V7_2H023, SM_V7_2H027, SM_V7_2H028, SM_V7_2H029, SM_V7_2H033, SM_V7_2H034, SM_V7_2H035, SM_V7_3H002, SM_V7_3H005, SM_V7_3H007, SM_V7_3H010, SM_V7_3H012, SM_V7_3H014, SM_V7_3H018, SM_V7_3H019, SM_V7_3H020, SM_V7_3H021, SM_V7_3H022, SM_V7_4H005, SM_V7_4H008, SM_V7_4H010, SM_V7_4H013, SM_V7_4H015, SM_V7_4H022, SM_V7_4H023, SM_V7_4H024, SM_V7_4H028, SM_V7_4H029, SM_V7_4H036, SM_V7_4H038, SM_V7_5H003, SM_V7_5H004, SM_V7_5H006, SM_V7_5H016, SM_V7_5H019, SM_V7_5H020, SM_V7_5H021, SM_V7_5H023, SM_V7_5H027, SM_V7_6H005, SM_V7_6H011, SM_V7_6H014, SM_V7_6H016, SM_V7_6H0

>> calculating distance from peak to TSS...	 2023-06-12 03:10:48 PM 
>> assigning genomic annotation...		 2023-06-12 03:10:48 PM 


"Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': SM_V7_1001, SM_V7_1H005, SM_V7_1H008, SM_V7_1H012, SM_V7_1H013, SM_V7_1H021, SM_V7_1H022, SM_V7_1H023, SM_V7_1H025, SM_V7_1H026, SM_V7_1H028, SM_V7_1H031, SM_V7_1H032, SM_V7_1H033, SM_V7_1H034, SM_V7_1H035, SM_V7_1H037, SM_V7_1H038, SM_V7_1H041, SM_V7_1H042, SM_V7_1H044, SM_V7_2H013, SM_V7_2H015, SM_V7_2H016, SM_V7_2H017, SM_V7_2H020, SM_V7_2H022, SM_V7_2H023, SM_V7_2H027, SM_V7_2H028, SM_V7_2H029, SM_V7_2H033, SM_V7_2H034, SM_V7_2H035, SM_V7_3H002, SM_V7_3H005, SM_V7_3H007, SM_V7_3H010, SM_V7_3H012, SM_V7_3H014, SM_V7_3H018, SM_V7_3H019, SM_V7_3H020, SM_V7_3H021, SM_V7_3H022, SM_V7_4H005, SM_V7_4H008, SM_V7_4H010, SM_V7_4H013, SM_V7_4H015, SM_V7_4H022, SM_V7_4H023, SM_V7_4H024, SM_V7_4H028, SM_V7_4H029, SM_V7_4H036, SM_V7_4H038, SM_V7_5H003, SM_V7_5H004, SM_V7_5H006, SM_V7_5H016, SM_V7_5H019, SM_V7_5H020, SM_V7_5H021, SM_V7_5H023, SM_V7_5H027, SM_V7_6H005, SM_V7_6H011, SM_V7_6H014, SM_V7_6H016, SM_V7_6H0

>> assigning chromosome lengths			 2023-06-12 03:10:54 PM 
>> done...					 2023-06-12 03:10:54 PM 


In [13]:
# Flatten the annotated Sman peaks (the `anno` slot of the result) into a
# data frame and save it as CSV.
# NOTE(review): write_csv() does not write row names, so the "PeakN" names
# are presumably absent from the file -- confirm that is intended.
Sman_peak_annot <- as.data.frame(slot(Sman_peakAnno, "anno"))
write_csv(
  Sman_peak_annot,
  "./Metadata/Sman.chipseekr_annots.csv"
)

## Mlig peak annotation

In [11]:
# Reorder the Mlig peak set so that peaks are grouped by sequence name, with
# the sequence names taken in sorted order, then label the peaks Peak1..PeakN.
Mlig_peaks <- getPeakSet(Mlig)
Mlig_chr_order <- sort(seqlevels(Mlig_peaks))

# Rank every peak by the position of its sequence name in the sorted order.
# order() leaves ties in their original order, so peaks on the same sequence
# keep the relative order they had in the ArchR peak set. A single subset
# replaces the per-sequence list that was previously concatenated pairwise.
Mlig_chr_rank <- match(as.character(seqnames(Mlig_peaks)), Mlig_chr_order)
Mlig_reordered_features <- Mlig_peaks[order(Mlig_chr_rank)]

# seq_along() is safe for an empty peak set, unlike 1:length(x).
names(Mlig_reordered_features) <- sprintf(
  "Peak%d",
  seq_along(Mlig_reordered_features)
)

In [10]:
# Print the reordered, renamed Mlig peak set for a visual check.
Mlig_reordered_features

GRanges object with 236018 ranges and 4 metadata columns:
                seqnames      ranges strand |     score       idx        GC
                   <Rle>   <IRanges>  <Rle> | <integer> <integer> <numeric>
       Peak1 ML45REF0001     104-253      * |       108         1    0.3133
       Peak2 ML45REF0001   1623-2063      * |        93         2    0.5488
       Peak3 ML45REF0001   3130-3535      * |       263         3    0.5739
       Peak4 ML45REF0001   7171-7418      * |       134         4    0.6331
       Peak5 ML45REF0001 13231-13993      * |       262         5    0.5976
         ...         ...         ...    ... .       ...       ...       ...
  Peak236014 ML45REF0415    986-1420      * |        75         2    0.5977
  Peak236015 ML45REF0415   3425-3574      * |        69         3    0.5667
  Peak236016 ML45REF0415   5749-6284      * |       142         4    0.5429
  Peak236017 ML45REF0415   7555-7706      * |        70         5    0.5197
  Peak236018 ML45REF0415   776

In [12]:
# Build a TxDb from the Mlig gene-model GTF. It is used to annotate each peak
# with its genomic feature type so the annotation can later be attached to
# the per-peak metadata.
# NOTE(review): this GTF path is absolute ("/Refs/..."), while other paths in
# this notebook are relative (e.g. "./Metadata/") -- confirm it is intended.
Mlig_txdb <- makeTxDbFromGFF(
  file = "/Refs/Mlig_4_5.v5/Mlig_RNA_4_5.v5.coregenes.Chew.gtf"
)
# Unused leftover path, kept for reference:
#peak_file <- "/media/gary/Chew/Schisto_ATAC/schisto_all/cc_ENCODE_peaks_CT/itoverlap_overlap_all.narrowPeak"

Import genomic features from the file as a GRanges object ... 
OK

Prepare the 'metadata' data frame ... 
OK

Make the TxDb object ... 
OK



In [14]:
# Annotate every Mlig peak against the Mlig TxDb with ChIPseeker.
# tssRegion sets the window around the TSS passed to annotatePeak();
# genomicAnnotationPriority is the order used to pick one label when a peak
# matches several feature types.
Mlig_peakAnno <- annotatePeak(
  Mlig_reordered_features,
  TxDb = Mlig_txdb,
  tssRegion = c(-1000, 1000),
  genomicAnnotationPriority = c(
    "Promoter", "Exon", "Intron", "3UTR", "5UTR", "Downstream", "Intergenic"
  ),
  overlap = "all"
)

>> preparing features information...		 2023-06-01 06:17:52 PM 
>> identifying nearest features...		 2023-06-01 06:17:52 PM 
>> calculating distance from peak to TSS...	 2023-06-01 06:17:57 PM 
>> assigning genomic annotation...		 2023-06-01 06:17:57 PM 
>> assigning chromosome lengths			 2023-06-01 06:18:06 PM 
>> done...					 2023-06-01 06:18:06 PM 


In [15]:
# Flatten the annotated Mlig peaks (the `anno` slot of the result) into a
# data frame and save it as CSV.
# NOTE(review): write_csv() does not write row names, so the "PeakN" names
# are presumably absent from the file -- confirm that is intended.
Mlig_peak_annot <- as.data.frame(Mlig_peakAnno@anno)

# Write the Mlig table itself. The previous call passed `peak_annot`, which
# is not defined in this notebook, so it would either fail or write whatever
# stale object of that name was left in the session.
write_csv(Mlig_peak_annot, "./Metadata/Mlig.chipseekr_annots.csv")