# Melanoma Cohort Analysis Pipeline

This notebook documents the analysis pipeline for the melanoma cohort analyzed in the SpliceMutr paper. All bash scripts were written specifically for our own compute clusters, but they outline the general structure and usage of the underlying R and Python scripts that make up the SpliceMutr pipeline. You will need to modify the bash files before you can run each script yourself.

# Aligning the Melanoma Cohort using STAR

## STAR and samtools usage, bash

In [29]:

"""

# job submission params
#!/bin/bash
#$ -N fastq2bams
#$ -S /bin/sh
#$ -l mem_free=25G,h_vmem=30G
#$ -o /fastq2bams/bam_fastq.o
#$ -e /fastq2bams/bam_fastq.e
#$ -M tpalme15@jhmi.edu
#$ -t 1-114 -tc 10

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

echo $(date)

# parse variables
NUM=$SGE_TASK_ID # number of the line from input file that has filename you want to use as input to STAR
fastq1=`sed "${NUM}q;d" /dcs04/fertig/data/theron/share/filenames.txt` # the fastq1 files for the cohort
fastq2="${fastq1/1.clipped/2.clipped}"
samplename=${fastq1%"_1.clipped.fastq.gz"}
samplename=${samplename##*/}
outprefix="/bams/${samplename}"

GENOME_DIR=/GRCh38_Ensembl99_sparseD3_sjdbOverhang99

# align fastqs using STAR
STAR --genomeDir $GENOME_DIR --readFilesIn ${fastq1} ${fastq2} --twopassMode Basic --outSAMstrandField intronMotif --outFileNamePrefix $outprefix --runThreadN 6 --readFilesCommand zcat --outSAMtype BAM Unsorted

# convenience variable for sam file created by STAR aligner
samfile="${outprefix}Aligned.out.sam"

# drop any reads whose sequence (SEQ, field 10) and quality (QUAL, field 11) lengths differ
fixedsam=`echo $samfile | sed s/sam/fixed.sam/`
cat $samfile | awk '{if (length($10)==length($11)) print $0}' > $fixedsam

# convert sam file to bam file
samtools view -S -b $fixedsam > ${outprefix}.bam

# sort bam file
#samtools sort ${outprefix}.bam ${outprefix}.sorted

# index bam file
#samtools index ${outprefix}.sorted.bam

# rm sam/fixedsam/bam/ file now that sorted bam has been created from it
rm $samfile
rm $fixedsam
#rm ${outprefix}.bam

"""

'\n\n# job submission params\n#!/bin/bash\n#$ -N fastq2bams\n#$ -S /bin/sh\n#$ -l mem_free=25G,h_vmem=30G\n#$ -o /fastq2bams/bam_fastq.o\n#$ -e /fastq2bams/bam_fastq.e\n#$ -M tpalme15@jhmi.edu\n#$ -t 1-114 -tc 10\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/R-4.0.2\n\necho $(date)\n\n# parse variables\nNUM=$SGE_TASK_ID # number of the line from input file that has filename you want to use as input to STAR\nfastq1=`sed "${NUM}q;d" /dcs04/fertig/data/theron/share/filenames.txt` # the fastq1 files for the cohort\nfastq2="${fastq1/1.clipped/2.clipped}"\nsamplename=${fastq1%"_1.clipped.fastq.gz"}\nsamplename=${samplename##*/}\noutprefix="/bams/${samplename}"\n\nGENOME_DIR=/GRCh38_Ensembl99_sparseD3_sjdbOverhang99\n\n# align fastqs using STAR\nSTAR --genomeDir $GENOME_DIR --readFilesIn ${fastq1} ${fastq2} --twopassMode Basic --outSAMstrandField intronMotif --outFileNamePrefix $outprefix --runThreadN 6 --readFilesCommand zcat --outSAMtype BAM Unsorted\n\n# convenience 

# Generating the necessary files for analysis, R

## Loading in and processing the manifest file, R 

In [30]:

"""

manifest <- read_excel("/Valsamo/manifest.xlsx")
files_to_remove <- c("hg19MTERCC-ensembl75-genes-Q21777-Plate-1-E06_L65",
"hg19MTERCC-ensembl75-genes-Q21777-Plate-1-F12_L1.D707_508",
"hg19MTERCC-ensembl75-genes-Q23152+B4+H2+AG710464_L1.D705")
manifest <- manifest %>% dplyr::filter(!(Sample %in% files_to_remove))
manifest$Sample <- str_replace_all(manifest$Sample,"hg19MTERCC-ensembl75-genes-","")
fastq_files <- read.table("/Valsamo/fastq_files.txt")
SJ_files <- read.table("/Valsamo/SJ_files.txt")
SJ_files$sample_name <- vapply(SJ_files$V1,function(file){
  str_remove(file,"SJ.out.tab")
},character(1))
fastq_files$sample_name <- vapply(fastq_files$V1,function(file){
  str_remove(file,"_1.clipped.fastq.gz")
},character(1))
manifest$AX_TRTGRP <- vapply(manifest$AX_TRTGRP,function(TRTGRP){
  if (str_detect(TRTGRP,"IPI")){
    return("NIV-IPI")
  } else {
    return(TRTGRP)
  }
},character(1))

"""

'\n\nmanifest <- read_excel("/Valsamo/manifest.xlsx")\nfiles_to_remove <- c("hg19MTERCC-ensembl75-genes-Q21777-Plate-1-E06_L65",\n"hg19MTERCC-ensembl75-genes-Q21777-Plate-1-F12_L1.D707_508",\n"hg19MTERCC-ensembl75-genes-Q23152+B4+H2+AG710464_L1.D705")\nmanifest <- manifest %>% dplyr::filter(!(Sample %in% files_to_remove))\nmanifest$Sample <- str_replace_all(manifest$Sample,"hg19MTERCC-ensembl75-genes-","")\nfastq_files <- read.table("/Valsamo/fastq_files.txt")\nSJ_files <- read.table("/Valsamo/SJ_files.txt")\nSJ_files$sample_name <- vapply(SJ_files$V1,function(file){\n  str_remove(file,"SJ.out.tab")\n},character(1))\nfastq_files$sample_name <- vapply(fastq_files$V1,function(file){\n  str_remove(file,"_1.clipped.fastq.gz")\n},character(1))\nmanifest$AX_TRTGRP <- vapply(manifest$AX_TRTGRP,function(TRTGRP){\n  if (str_detect(TRTGRP,"IPI")){\n    return("NIV-IPI")\n  } else {\n    return(TRTGRP)\n  }\n},character(1))\n\n'

## Forming PRE-treatment and POST-treatment groups, R

In [31]:

"""

PRE_samples_ipi_naive <- manifest[manifest$AX_TIMETEMP=="PRE" & manifest$AX_BOR3!="NE",]
POST_samples <- manifest[manifest$AX_TIMETEMP=="POST",]
PRE_samples <- manifest[manifest$AX_TIMETEMP=="PRE",]

groups_and_junc_dir <- mod_path("/mnt/f/Valsamo/leafcutter_prep/run_20230320")
JHPCE_dir <- "/dcs04/fertig/data/theron/share/juncs"
comparisons <- list()
state <- "PRE"

"""


'\n\nPRE_samples_ipi_naive <- manifest[manifest$AX_TIMETEMP=="PRE" & manifest$AX_BOR3!="NE",]\nPOST_samples <- manifest[manifest$AX_TIMETEMP=="POST",]\nPRE_samples <- manifest[manifest$AX_TIMETEMP=="PRE",]\n\ngroups_and_junc_dir <- mod_path("/mnt/f/Valsamo/leafcutter_prep/run_20230320")\nJHPCE_dir <- "/dcs04/fertig/data/theron/share/juncs"\ncomparisons <- list()\nstate <- "PRE"\n\n'

## Creating comparisons for LeafCutterMD analysis, R

In [32]:

"""

comparison <- "baseline_vs_post_treatment"
targets<-POST_samples$Sample
comparators<-PRE_samples$Sample
a<-list()
a[["targets"]] <- targets
a[["comparators"]] <- comparators

for (target in targets){
  target_junc_file <- sprintf("%s/%s_baseline_POST_treatment_juncs.txt",groups_and_junc_dir,target)
  file_contents <- sprintf("%s.filt.junc",c(target,comparators))
  file_contents <- data.frame(file_contents)
  write.table(file_contents,target_junc_file,
            sep="\t",
            quote=F,
            col.names=F,
            row.names=F)
}
comparisons[[comparison]] <- a
saveRDS(comparisons,file=sprintf("%s/comparisons.rds",groups_and_junc_dir))

"""


'\n\ncomparison <- "baseline_vs_post_treatment"\ntargets<-POST_samples$Sample\ncomparators<-PRE_samples$Sample\na<-list()\na[["targets"]] <- targets\na[["comparators"]] <- comparators\n\nfor (target in targets){\n  target_junc_file <- sprintf("%s/%s_baseline_POST_treatment_juncs.txt",groups_and_junc_dir,target)\n  file_contents <- sprintf("%s.filt.junc",c(target,comparators))\n  file_contents <- data.frame(file_contents)\n  write.table(file_contents,target_junc_file,\n            sep="\t",\n            quote=F,\n            col.names=F,\n            row.names=F)\n}\ncomparisons[[comparison]] <- a\nsaveRDS(comparisons,file=sprintf("%s/comparisons.rds",groups_and_junc_dir))\n\n'

# Running LeafCutterMD outlier splice-junction usage analysis

## splicemutr_leafcutter_cluster_regtools.py and leafcutterMD.R usage

In [33]:

"""

# job submission params

#!/bin/bash
#$ -N leafcutter
#$ -S /bin/sh
#$ -N leafcutter
#$ -l mem_free=20G,h_vmem=25G
#$ -o /runLeafcutter/leaf_run_12072021.o
#$ -e /runLeafcutter/leaf_run_12072021.e
#$ -M tpalme15@jhmi.edu
#$ -t 1-117 -tc 15

echo $(date)

module load conda
source activate /miniconda3/envs/R-4.0.2

JUNC_DIR=/juncs
LEAF_SCRIPTS=/leafcutter/scripts
OUTLIER_FILES=/outlier_files.txt
JUNC_FILE=$(sed -n ${SGE_TASK_ID}p $OUTLIER_FILES)
JUNC_FILE=$(echo $JUNC_FILE)
SAMPLE=$(basename $JUNC_FILE | sed s/'.txt'/''/g)

cd $JUNC_DIR

echo "leafcutter_cluster_regtools"
python2 $LEAF_SCRIPTS/splicemutr_leafcutter_cluster_regtools.py -j $JUNC_FILE -o $SAMPLE -l 500000

echo "leafcutter_ds"
$LEAF_SCRIPTS/leafcutterMD.R --num_threads $NSLOTS -o $SAMPLE ${SAMPLE}_perind_numers.counts.gz

echo $(date)

"""


'\n\n# job submission params\n\n#!/bin/bash\n#$ -N leafcutter\n#$ -S /bin/sh\n#$ -N leafcutter\n#$ -l mem_free=20G,h_vmem=25G\n#$ -o /runLeafcutter/leaf_run_12072021.o\n#$ -e /runLeafcutter/leaf_run_12072021.e\n#$ -M tpalme15@jhmi.edu\n#$ -t 1-117 -tc 15\n\necho $(date)\n\nmodule load conda\nsource activate /miniconda3/envs/R-4.0.2\n\nJUNC_DIR=/juncs\nLEAF_SCRIPTS=/leafcutter/scripts\nOUTLIER_FILES=/outlier_files.txt\nJUNC_FILE=$(sed -n ${SGE_TASK_ID}p $OUTLIER_FILES)\nJUNC_FILE=$(echo $JUNC_FILE)\nSAMPLE=$(basename $JUNC_FILE | sed s/\'.txt\'/\'\'/g)\n\ncd $JUNC_DIR\n\necho "leafcutter_cluster_regtools"\npython2 $LEAF_SCRIPTS/splicemutr_leafcutter_cluster_regtools.py -j $JUNC_FILE -o $SAMPLE -l 500000\n\necho "leafcutter_ds"\n$LEAF_SCRIPTS/leafcutterMD.R --num_threads $NSLOTS -o $SAMPLE ${SAMPLE}_perind_numers.counts.gz\n\necho $(date)\n\n'

# LeafCutterMD Postprocessing

## Compiling comparison junctions output from LeafCutterMD, R

In [34]:

"""

comparison_junctions <- list()
leafcutter_files <- read.table(mod_path("/run_20230320/filenames.txt"),sep="\t")
leafcutter_effect_sizes <- read.table(mod_path("/run_20230320/effect_sizes.txt"),sep="\t")
for (i in seq(nrow(leafcutter_files))){
  print(i)
  outlier_file <- leafcutter_files[i,]
  outlier_juncs <- read.table(outlier_file,check.names = F)
  outlier_junc_cols <- str_replace_all(colnames(outlier_juncs),".filt","")
  colnames(outlier_juncs) <- outlier_junc_cols
  file_split <- strsplit(outlier_file,"_juncs_")[[1]]
  sample <- basename(file_split[1])
  sample <- substr(sample,1,nchar(sample))
  sample <- str_remove(sample,"_baseline_POST_treatment")
  eff_size_file <- sprintf("%s/%s_baseline_POST_treatment_juncs_effSize.txt",dirname(outlier_file),sample)
  eff_size <- read.table(mod_path(eff_size_file),sep="\t",check.names = F)
  outlier_juncs_cols <- str_replace_all(colnames(eff_size),".filt","")
  colnames(eff_size) <- outlier_juncs_cols
  diff_juncs <- rownames(eff_size)[abs(eff_size[,sample])>=0.6]
  sig_juncs <- rownames(outlier_juncs)[as.numeric(outlier_juncs[,sample]) <=0.05]
  comparison_junctions[[sample]] <- unique(c(comparison_junctions[[sample]],union(sig_juncs,diff_juncs)))
}

"""


'\n\ncomparison_junctions <- list()\nleafcutter_files <- read.table(mod_path("/run_20230320/filenames.txt"),sep="\t")\nleafcutter_effect_sizes <- read.table(mod_path("/run_20230320/effect_sizes.txt"),sep="\t")\nfor (i in seq(nrow(leafcutter_files))){\n  print(i)\n  outlier_file <- leafcutter_files[i,]\n  outlier_juncs <- read.table(outlier_file,check.names = F)\n  outlier_junc_cols <- str_replace_all(colnames(outlier_juncs),".filt","")\n  colnames(outlier_juncs) <- outlier_junc_cols\n  file_split <- strsplit(outlier_file,"_juncs_")[[1]]\n  sample <- basename(file_split[1])\n  sample <- substr(sample,1,nchar(sample))\n  sample <- str_remove(sample,"_baseline_POST_treatment")\n  eff_size_file <- sprintf("%s/%s_baseline_POST_treatment_juncs_effSize.txt",dirname(outlier_file),sample)\n  eff_size <- read.table(mod_path(eff_size_file),sep="\t",check.names = F)\n  outlier_juncs_cols <- str_replace_all(colnames(eff_size),".filt","")\n  colnames(eff_size) <- outlier_juncs_cols\n  diff_juncs <- 

In [35]:
## Splitting LeafCutterMD splice junctions for SpliceMutr transcript formation

# Forming transcripts using LeafCutterMD 

## form_transcripts.R usage

# Calculating the coding potential of SpliceMutr-formed transcripts

## calc_coding_potential.R usage

## combine_splicemutr.R usage

In [36]:

"""

# job submission params

#!/bin/sh
#$ -N combine_splicemutr
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=5G
#$ -o /combine_splicemutr/splice_comb.o
#$ -e /combine_splicemutr/splice_comb.e
#$ -M tpalme15@jhmi.edu

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

OUT=/run_20230320/combine_splicemutr_out_cp
SPLICE_FILES=/run_20230320/formed_transcripts/filenames_cp.txt
SCRIPT_DIR=/splicemute/scripts

$SCRIPT_DIR/combine_splicemutr.R -o $OUT -s $SPLICE_FILES

echo $(date)

"""


'\n\n# job submission params\n\n#!/bin/sh\n#$ -N combine_splicemutr\n#$ -S /bin/sh\n#$ -l mem_free=5G,h_vmem=5G\n#$ -o /combine_splicemutr/splice_comb.o\n#$ -e /combine_splicemutr/splice_comb.e\n#$ -M tpalme15@jhmi.edu\n\necho $(date)\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/R-4.0.2\n\nOUT=/run_20230320/combine_splicemutr_out_cp\nSPLICE_FILES=/run_20230320/formed_transcripts/filenames_cp.txt\nSCRIPT_DIR=/splicemute/scripts\n\n$SCRIPT_DIR/combine_splicemutr.R -o $OUT -s $SPLICE_FILES\n\necho $(date)\n\n'

# Processing peptides output from the combined SpliceMutr output

## process_peptides.py usage

In [37]:

"""

# job submission params

#!/bin/sh
#$ -N process_peptides
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=5G
#$ -o /process_peptides/peps_proc.o
#$ -e /process_peptides/peps_proc.e
#$ -M tpalme15@jhmi.edu
#$ -m ea

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

SCRIPT_DIR=/splicemute/inst
PEPTIDES=/run_20230320/combine_splicemutr_out_cp/proteins.txt
OUT_DIR=/run_20230320/process_peptides_out
KMER_LENGTH=9

$SCRIPT_DIR/process_peptides.py -p $PEPTIDES -o $OUT_DIR -k $KMER_LENGTH

"""


'\n\n# job submission params\n\n#!/bin/sh\n#$ -N process_peptides\n#$ -S /bin/sh\n#$ -l mem_free=5G,h_vmem=5G\n#$ -o /process_peptides/peps_proc.o\n#$ -e /process_peptides/peps_proc.e\n#$ -M tpalme15@jhmi.edu\n#$ -m ea\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/R-4.0.2\n\nSCRIPT_DIR=/splicemute/inst\nPEPTIDES=/run_20230320/combine_splicemutr_out_cp/proteins.txt\nOUT_DIR=/run_20230320/process_peptides_out\nKMER_LENGTH=9\n\n$SCRIPT_DIR/process_peptides.py -p $PEPTIDES -o $OUT_DIR -k $KMER_LENGTH\n\n'

# Performing HLA genotyping on the melanoma cohort

## extract.py and genotype.py usage

In [38]:

"""

# job submission params

#!/bin/sh
#$ -N arcashla_geno
#$ -S /bin/sh
#$ -l mem_free=10G,h_vmem=15G
#$ -o /arcashla/arcas_geno.o
#$ -e /arcashla/arcas_geno.e
#$ -M tpalme15@jhmi.edu
#$ -m e
# -t 1-117 -tc 30

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/arcashla

GENOTYPES_DIR=/genotypes
FILENAMES_FILE=/bamfiles.txt # all bamfiles for cohort listed in file
FILE=$(sed -n ${SGE_TASK_ID}p $FILENAMES_FILE)
FILE_BASE=$(basename $FILE)
FILE_DIR=$GENOTYPES_DIR/${FILE_BASE}_dir
mkdir $FILE_DIR

ARCAS_SCRIPTS=/users/tpalmer/arcasHLA/scripts

# sort bam file
samtools sort -o ${FILE}.sorted $FILE

python $ARCAS_SCRIPTS/extract.py ${FILE} -o $FILE_DIR -v

cd $FILE_DIR
#python $ARCAS_SCRIPTS/reference.py --update

FASTQ1=$(ls *.extracted.1*)
FASTQ2=$(ls *.extracted.2*)
python $ARCAS_SCRIPTS/genotype.py $FASTQ1 $FASTQ2 -g A,B,C,DPA1,DPB1,DQA1,DQB1,DRA,DRB1 -o $FILE_DIR -v

"""


'\n\n# job submission params\n\n#!/bin/sh\n#$ -N arcashla_geno\n#$ -S /bin/sh\n#$ -l mem_free=10G,h_vmem=15G\n#$ -o /arcashla/arcas_geno.o\n#$ -e /arcashla/arcas_geno.e\n#$ -M tpalme15@jhmi.edu\n#$ -m e\n# -t 1-117 -tc 30\n\necho $(date)\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/arcashla\n\nGENOTYPES_DIR=/genotypes\nFILENAMES_FILE=/bamfiles.txt # all bamfiles for cohort listed in file\nFILE=$(sed -n ${SGE_TASK_ID}p $FILENAMES_FILE)\nFILE_BASE=$(basename $FILE)\nFILE_DIR=$GENOTYPES_DIR/${FILE_BASE}_dir\nmkdir $FILE_DIR\n\nARCAS_SCRIPTS=/users/tpalmer/arcasHLA/scripts\n\n# sort bam file\nsamtools sort -o ${FILE}.sorted $FILE\n\npython $ARCAS_SCRIPTS/extract.py ${FILE} -o $FILE_DIR -v\n\ncd $FILE_DIR\n#python $ARCAS_SCRIPTS/reference.py --update\n\nFASTQ1=$(ls *.extracted.1*)\nFASTQ2=$(ls *.extracted.2*)\npython $ARCAS_SCRIPTS/genotype.py $FASTQ1 $FASTQ2 -g A,B,C,DPA1,DPB1,DQA1,DQB1,DRA,DRB1 -o $FILE_DIR -v\n\n'

The JSON genotype files output by the above section are compiled into two files: a class_1_alleles file containing the unique set of HLA class I alleles, one per line, and a genotypes file in R .rds format containing a list whose names are the samples and whose elements are character vectors holding each sample's HLA genotype.

# Running MHCnuggets on the processed peptides

## runMHCnuggets_ind.py usage

In [39]:

"""

# job submission params

#!/bin/sh
#$ -N mhcnuggets
#$ -S /bin/sh
#$ -l mem_free=10G,h_vmem=15G
#$ -o /running_mhcnuggets/mhc_run.o
#$ -e /running_mhcnuggets/mhc_run.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -t 1-82 -tc 82

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

TYPE="I"
INPUT_KMERS=/process_peptides_out/peps_9.txt
MHC_ALLELE_FILE=/running_mhcnuggets/class_1_alleles.txt
ALLELE=$(sed -n ${SGE_TASK_ID}p $MHC_ALLELE_FILE)
OUT_DIR=/running_mhcnuggets/mhcnuggets_out
SCRIPT_DIR=/users/tpalmer/splicemute/inst

$SCRIPT_DIR/runMHCnuggets_ind.py -t $TYPE -k $INPUT_KMERS -m $ALLELE -o $OUT_DIR

echo $(date)

"""


'\n\n# job submission params\n\n#!/bin/sh\n#$ -N mhcnuggets\n#$ -S /bin/sh\n#$ -l mem_free=10G,h_vmem=15G\n#$ -o /running_mhcnuggets/mhc_run.o\n#$ -e /running_mhcnuggets/mhc_run.e\n#$ -M tpalme15@jhmi.edu\n#$ -m ea\n#$ -t 1-82 -tc 82\n\necho $(date)\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/R-4.0.2\n\nTYPE="I"\nINPUT_KMERS=/process_peptides_out/peps_9.txt\nMHC_ALLELE_FILE=/running_mhcnuggets/class_1_alleles.txt\nALLELE=$(sed -n ${SGE_TASK_ID}p $MHC_ALLELE_FILE)\nOUT_DIR=/running_mhcnuggets/mhcnuggets_out\nSCRIPT_DIR=/users/tpalmer/splicemute/inst\n\n$SCRIPT_DIR/runMHCnuggets_ind.py -t $TYPE -k $INPUT_KMERS -m $ALLELE -o $OUT_DIR\n\necho $(date)\n\n'

# Processing the binding affinity predictions output from MHCnuggets

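## process_bindaff.py usage

*NOTE(review): the cell below appears to repeat the runMHCnuggets_ind.py submission script from the previous section rather than showing a process_bindaff.py call. Confirm and replace it with the intended script.*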

In [40]:

"""

# job submission params

#!/bin/sh
#$ -N mhcnuggets
#$ -S /bin/sh
#$ -l mem_free=10G,h_vmem=15G
#$ -o /running_mhcnuggets/mhc_run.o
#$ -e /running_mhcnuggets/mhc_run.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -t 1-82 -tc 82

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

TYPE="I"
INPUT_KMERS=/process_peptides_out/peps_9.txt
MHC_ALLELE_FILE=/running_mhcnuggets/class_1_alleles.txt
ALLELE=$(sed -n ${SGE_TASK_ID}p $MHC_ALLELE_FILE)
#ALLELE=$(sed -n 1p $MHC_ALLELE_FILE)
OUT_DIR=/running_mhcnuggets/mhcnuggets_out
SCRIPT_DIR=/splicemute/inst

$SCRIPT_DIR/runMHCnuggets_ind.py -t $TYPE -k $INPUT_KMERS -m $ALLELE -o $OUT_DIR

echo $(date)

"""


'\n\n# job submission params\n\n#!/bin/sh\n#$ -N mhcnuggets\n#$ -S /bin/sh\n#$ -l mem_free=10G,h_vmem=15G\n#$ -o /running_mhcnuggets/mhc_run.o\n#$ -e /running_mhcnuggets/mhc_run.e\n#$ -M tpalme15@jhmi.edu\n#$ -m ea\n#$ -t 1-82 -tc 82\n\necho $(date)\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/R-4.0.2\n\nTYPE="I"\nINPUT_KMERS=/process_peptides_out/peps_9.txt\nMHC_ALLELE_FILE=/running_mhcnuggets/class_1_alleles.txt\nALLELE=$(sed -n ${SGE_TASK_ID}p $MHC_ALLELE_FILE)\n#ALLELE=$(sed -n 1p $MHC_ALLELE_FILE)\nOUT_DIR=/running_mhcnuggets/mhcnuggets_out\nSCRIPT_DIR=/splicemute/inst\n\n$SCRIPT_DIR/runMHCnuggets_ind.py -t $TYPE -k $INPUT_KMERS -m $ALLELE -o $OUT_DIR\n\necho $(date)\n\n'

# Extracting binding data per HLA allele

## extract_data.py usage

In [41]:

"""

# job submission params

#!/bin/sh
#$ -N extract_data
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=10G
#$ -o /extract_data/data_extract.o
#$ -e /extract_data/data_extract.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -t 1-82 -tc 82

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

ALLELE_FILES=/running_mhcnuggets/class_1_alleles.txt
ALLELE=$(sed -n ${SGE_TASK_ID}p $ALLELE_FILES)
PICKLE_DIR=/process_bindaff_out

SCRIPT_DIR=/splicemute/inst

$SCRIPT_DIR/extract_data.py -a $ALLELE -p $PICKLE_DIR -b 9 -e 10

echo $(date)

"""


'\n\n# job submission params\n\n#!/bin/sh\n#$ -N extract_data\n#$ -S /bin/sh\n#$ -l mem_free=5G,h_vmem=10G\n#$ -o /extract_data/data_extract.o\n#$ -e /extract_data/data_extract.e\n#$ -M tpalme15@jhmi.edu\n#$ -m ea\n#$ -t 1-82 -tc 82\n\necho $(date)\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/R-4.0.2\n\nALLELE_FILES=/running_mhcnuggets/class_1_alleles.txt\nALLELE=$(sed -n ${SGE_TASK_ID}p $ALLELE_FILES)\nPICKLE_DIR=/process_bindaff_out\n\nSCRIPT_DIR=/splicemute/inst\n\n$SCRIPT_DIR/extract_data.py -a $ALLELE -p $PICKLE_DIR -b 9 -e 10\n\necho $(date)\n\n'

# Generating SpliceMutr binding data based on genotype

## valsamo_analyze_splicemutr.R usage

In [42]:

"""

# job submission params

#!/bin/sh
#$ -N analyze_splicemutr
#$ -S /bin/sh
#$ -l mem_free=3G,h_vmem=10G
#$ -o /analyze_splicemutr/splice_analyze.o
#$ -e /analyze_splicemutr/splice_analyze.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -t 1-114 -tc 114

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

GENOTYPES=/analyze_splicemutr/genotypes.rds
SUMMARY_DIR=/process_bindaff/process_bindaff_out
SPLICE_DAT_FILE=/combine_splicemutr_out/data_splicemutr_all_pep.rds
COUNTS_FILES=/inner_juncs/filenames.txt # .junc files
COUNTS_FILE=$(sed -n ${SGE_TASK_ID}p $COUNTS_FILES)
OUT_DIR=/analyze_splicemutr_out
SCRIPT_DIR=/splicemute/scripts

$SCRIPT_DIR/valsamo_analyze_splicemutr.R -g $GENOTYPES -s $SUMMARY_DIR -d $SPLICE_DAT_FILE -c $COUNTS_FILE -o $OUT_DIR

echo $(date)

"""


'\n\n# job submission params\n\n#!/bin/sh\n#$ -N analyze_splicemutr\n#$ -S /bin/sh\n#$ -l mem_free=3G,h_vmem=10G\n#$ -o /analyze_splicemutr/splice_analyze.o\n#$ -e /analyze_splicemutr/splice_analyze.e\n#$ -M tpalme15@jhmi.edu\n#$ -m ea\n#$ -t 1-114 -tc 114\n\necho $(date)\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/R-4.0.2\n\nGENOTYPES=/analyze_splicemutr/genotypes.rds\nSUMMARY_DIR=/process_bindaff/process_bindaff_out\nSPLICE_DAT_FILE=/combine_splicemutr_out/data_splicemutr_all_pep.rds\nCOUNTS_FILES=/inner_juncs/filenames.txt # .junc files\nCOUNTS_FILE=$(sed -n ${SGE_TASK_ID}p $COUNTS_FILES)\nOUT_DIR=/analyze_splicemutr_out\nSCRIPT_DIR=/splicemute/scripts\n\n$SCRIPT_DIR/valsamo_analyze_splicemutr.R -g $GENOTYPES -s $SUMMARY_DIR -d $SPLICE_DAT_FILE -c $COUNTS_FILE -o $OUT_DIR\n\necho $(date)\n\n'

# Creating the splice junction expression object

## create_junc_expr.R usage

In [43]:

"""

# job submission params

#!/bin/sh
#$ -N create_junc_expression
#$ -S /bin/sh
#$ -l mem_free=20G,h_vmem=25G
#$ -o /create_junc_expression/expr_junc.o
#$ -e /create_junc_expression/expr_junc.e
#$ -M tpalme15@jhmi.edu
#$ -m ea 

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

SCRIPT_DIR=/splicemute/scripts
JUNC_DIR=juncs/inner_juncs
JUNC_FILES=/juncs/inner_juncs/filenames.txt # ls of the junc files and their directories
OUT_DIR=/create_junc_expression_out

$SCRIPT_DIR/create_junc_expr.R -j $JUNC_DIR -f $JUNC_FILES -o $OUT_DIR

echo $(date)

"""


'\n\n# job submission params\n\n#!/bin/sh\n#$ -N create_junc_expression\n#$ -S /bin/sh\n#$ -l mem_free=20G,h_vmem=25G\n#$ -o /create_junc_expression/expr_junc.o\n#$ -e /create_junc_expression/expr_junc.e\n#$ -M tpalme15@jhmi.edu\n#$ -m ea \n\necho $(date)\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/R-4.0.2\n\nSCRIPT_DIR=/splicemute/scripts\nJUNC_DIR=juncs/inner_juncs\nJUNC_FILES=/juncs/inner_juncs/filenames.txt # ls of the junc files and their directories\nOUT_DIR=/create_junc_expression_out\n\n$SCRIPT_DIR/create_junc_expr.R -j $JUNC_DIR -f $JUNC_FILES -o $OUT_DIR\n\necho $(date)\n\n'

# Generating gene expression data for the cohort

## featureCounts usage

In [44]:

"""

# job submission params

#!/bin/bash
#$ -N featurecounts
#$ -S /bin/sh
#$ -l mem_free=1G,h_vmem=1G
#$ -o /users/tpalmer/valsamo/featurecounts/feature_run.o
#$ -e /users/tpalmer/valsamo/featurecounts/feature_run.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -pe local 20
#$ -R y
#$ -t 1-117 -tc 1

module load featurecounts/2.0.0

echo $(date)

# parse variables
NUM=$SGE_TASK_ID # number of the line from input file that has filename you want to use as input to STAR
#NUM=1
BAM_FILE=/dcs04/fertig/data/theron/share/bams/bamfiles.txt
cd /dcs04/fertig/data/theron/share/bams
BAM=$(sed -n ${SGE_TASK_ID}p $BAM_FILE)

GTF_FILE=/users/tpalmer/valsamo/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/Homo_sapiens.GRCh38.99.gtf
OUT=/users/tpalmer/valsamo/featurecounts_out/$(basename $BAM)_feature_counts.txt

featureCounts -F GTF -a $GTF_FILE -O -s 0 -M -T 20 --largestOverlap --minOverlap 8 -p -C --donotsort -o $OUT $BAM

"""


'\n\n# job submission params\n\n#!/bin/bash\n#$ -N featurecounts\n#$ -S /bin/sh\n#$ -l mem_free=1G,h_vmem=1G\n#$ -o /users/tpalmer/valsamo/featurecounts/feature_run.o\n#$ -e /users/tpalmer/valsamo/featurecounts/feature_run.e\n#$ -M tpalme15@jhmi.edu\n#$ -m ea\n#$ -pe local 20\n#$ -R y\n#$ -t 1-117 -tc 1\n\nmodule load featurecounts/2.0.0\n\necho $(date)\n\n# parse variables\nNUM=$SGE_TASK_ID # number of the line from input file that has filename you want to use as input to STAR\n#NUM=1\nBAM_FILE=/dcs04/fertig/data/theron/share/bams/bamfiles.txt\ncd /dcs04/fertig/data/theron/share/bams\nBAM=$(sed -n ${SGE_TASK_ID}p $BAM_FILE)\n\nGTF_FILE=/users/tpalmer/valsamo/GRCh38_Ensembl99_sparseD3_sjdbOverhang99/Homo_sapiens.GRCh38.99.gtf\nOUT=/users/tpalmer/valsamo/featurecounts_out/$(basename $BAM)_feature_counts.txt\n\nfeatureCounts -F GTF -a $GTF_FILE -O -s 0 -M -T 20 --largestOverlap --minOverlap 8 -p -C --donotsort -o $OUT $BAM\n\n'

# Combining the featurecounts files

## combine_featurecounts.R usage

In [45]:

"""

# job submission params

#!/bin/bash
#$ -N featurecounts
#$ -S /bin/sh
#$ -l mem_free=10G,h_vmem=15G
#$ -o /featurecounts/feature_comb.o
#$ -e /featurecounts/feature_comb.e
#$ -M tpalme15@jhmi.edu
#$ -m ea

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

echo $(date)

# parse variables
FEATURECOUNTS_FILE=/featurecounts_out/filenames.txt # all of the featurecounts files for the cohort
SCRIPT_DIR=/splicemute/scripts

$SCRIPT_DIR/combine_featurecounts.R -f $FEATURECOUNTS_FILE

"""


'\n\n# job submission params\n\n#!/bin/bash\n#$ -N featurecounts\n#$ -S /bin/sh\n#$ -l mem_free=10G,h_vmem=15G\n#$ -o /featurecounts/feature_comb.o\n#$ -e /featurecounts/feature_comb.e\n#$ -M tpalme15@jhmi.edu\n#$ -m ea\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/R-4.0.2\n\necho $(date)\n\n# parse variables\nFEATURECOUNTS_FILE=/featurecounts_out/filenames.txt # all of the featurecounts files for the cohort\nSCRIPT_DIR=/splicemute/scripts\n\n$SCRIPT_DIR/combine_featurecounts.R -f $FEATURECOUNTS_FILE\n\n'

# Calculating the splicing antigenicity per sample

## calc_gene_metric_len_norm.R usage

In [46]:

"""

# job submission params

#!/bin/sh
#$ -N calc_gene_metric
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=10G
#$ -o /calc_gene_metric/gene_calc_len_norm.o
#$ -e /calc_gene_metric/gene_calc_len_norm.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -t 1-22 -tc 22

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

SCRIPT_DIR=/users/tpalmer/splicemute/scripts
GENE_EXPRESSION=/create_gene_expression/featurecounts_all_vst.rds
SPLICE_DAT_FILES=/create_comparisons_out/filenames.txt
SPLICE_DAT_FILE=$(sed -n ${SGE_TASK_ID}p $SPLICE_DAT_FILES)
COMPARISON=$(echo $(echo $(basename $SPLICE_DAT_FILE) | sed 's/splice_dat_//g') | sed 's/.rds//g')
KMER_COUNTS=/create_comparisons_out/kmers_specific_${COMPARISON}.rds
JUNC_EXPR_FILE=/create_junc_expression_out/junc_expr_combined_vst.rds
OUT_PREFIX=/calc_gene_metric_out/${COMPARISON}

$SCRIPT_DIR/calc_gene_metric_len_norm.R -g $GENE_EXPRESSION -s $SPLICE_DAT_FILE -k $KMER_COUNTS -j $JUNC_EXPR_FILE -o $OUT_PREFIX

echo $(date)

"""


"\n\n# job submission params\n\n#!/bin/sh\n#$ -N calc_gene_metric\n#$ -S /bin/sh\n#$ -l mem_free=5G,h_vmem=10G\n#$ -o /calc_gene_metric/gene_calc_len_norm.o\n#$ -e /calc_gene_metric/gene_calc_len_norm.e\n#$ -M tpalme15@jhmi.edu\n#$ -m ea\n#$ -t 1-22 -tc 22\n\necho $(date)\n\nmodule load conda\nsource activate /users/tpalmer/miniconda3/envs/R-4.0.2\n\nSCRIPT_DIR=/users/tpalmer/splicemute/scripts\nGENE_EXPRESSION=/create_gene_expression/featurecounts_all_vst.rds\nSPLICE_DAT_FILES=/create_comparisons_out/filenames.txt\nSPLICE_DAT_FILE=$(sed -n ${SGE_TASK_ID}p $SPLICE_DAT_FILES)\nCOMPARISON=$(echo $(echo $(basename $SPLICE_DAT_FILE) | sed 's/splice_dat_//g') | sed 's/.rds//g')\nKMER_COUNTS=/create_comparisons_out/kmers_specific_${COMPARISON}.rds\nJUNC_EXPR_FILE=/create_junc_expression_out/junc_expr_combined_vst.rds\nOUT_PREFIX=/calc_gene_metric_out/${COMPARISON}\n\n$SCRIPT_DIR/calc_gene_metric_len_norm.R -g $GENE_EXPRESSION -s $SPLICE_DAT_FILE -k $KMER_COUNTS -j $JUNC_EXPR_FILE -o $OUT_PRE