# The TCGA SpliceMutr Analysis Pipeline

This notebook documents the pipeline for analyzing the TCGA data using SpliceMutr. All bash scripts shown here were written for our own clusters, but they outline the general structure and usage of the underlying R and Python scripts that make up the pipeline. You will need to modify the bash scripts (paths, scheduler directives, conda environments) before running them yourself.

# Extracting the recount3 information for the TCGA data:

## recount3_tcga_juncs_init.R usage

In [None]:

"""

# SGE job script: runs recount3_tcga_juncs_init.R, writing into the output directory given by -o.
# The "#$" lines are scheduler directives (job name, shell, memory, log files, notification address).

#!/bin/sh
#$ -N recount3_init
#$ -S /bin/sh
#$ -l mem_free=30G,h_vmem=30G
#$ -o /TCGA/recount3_juncs/juncs_init.o
#$ -e /TCGA/recount3_juncs/juncs_init.e
#$ -M tpalme15@jhmi.edu

# Print the current date/time; called before and after the work so the log brackets the run.
log_timestamp() {
    echo $(date)
}

log_timestamp

# Activate the conda environment that provides R.
module load conda
source activate /miniconda3/envs/R-4.0.2

# Location of the SpliceMutr scripts and the directory handed to the script via -o.
SPLICEMUTR_SCRIPTS=/splicemute/scripts
JUNC_OUT_DIR=/TCGA_juncs

${SPLICEMUTR_SCRIPTS}/recount3_tcga_juncs_init.R -o ${JUNC_OUT_DIR}

log_timestamp

"""


# Forming the recount3 junctions into .junc files, forming the groups and the junc files per TCGA cancer subtype as well

## recount3_tcga_juncs.R usage

In [None]:

"""

# SGE job script: runs recount3_tcga_juncs.R, writing into the output directory given by -o.
# The "#$" lines are scheduler directives (job name, shell, memory, log files, notification address).

#!/bin/sh
#$ -N recount3_form
#$ -S /bin/sh
#$ -l mem_free=30G,h_vmem=30G
#$ -o /recount3_juncs/juncs_form.o
#$ -e /TCGA/recount3_juncs/juncs_form.e
#$ -M tpalme15@jhmi.edu

# Print the current date/time; called before and after the work so the log brackets the run.
log_timestamp() {
    echo $(date)
}

log_timestamp

# Activate the conda environment that provides R.
module load conda
source activate /miniconda3/envs/R-4.0.2

# Location of the SpliceMutr scripts and the directory handed to the script via -o.
SPLICEMUTR_SCRIPTS=/splicemute/scripts
JUNC_OUT_DIR=/recount3_juncs/TCGA_juncs

${SPLICEMUTR_SCRIPTS}/recount3_tcga_juncs.R -o ${JUNC_OUT_DIR}

log_timestamp

"""


# Running LeafCutter for each TCGA cancer subtype

## splicemutr_leafcutter_cluster_regtools.py, leafcutter_ds.R, and prepare_results.R usage

In [None]:

"""

# SGE job script: runs the three LeafCutter steps (intron clustering, differential
# splicing, leafviz result preparation) on one directory of .junc files.
# The "#$" lines are scheduler directives (job name, shell, memory, log files, notification address).

#!/bin/bash
#$ -N leafcutter
#$ -S /bin/sh
#$ -l mem_free=25G,h_vmem=25G
#$ -o /runLeafCutter/leaf_run.o
#$ -e /runLeafCutter/leaf_run.e
#$ -M tpalme15@jhmi.edu

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

JUNC_DIR=/splice_junctions             # directory holding junc_file.txt; also used as the run directory (-r)
LEAF_SCRIPTS=/leafcutter/scripts       # LeafCutter clustering / differential splicing scripts
REF_DIR=/leafcutter_annotations        # annotation files with the G026 prefix
LEAFVIZ_DIR=/leafcutter/leafviz        # location of prepare_results.R
GROUPS_FILE=/groups_file.txt           # groups file passed to leafcutter_ds.R and prepare_results.R

# Step 1: cluster the junctions listed in junc_file.txt; output files use the prefix "data".
# NOTE(review): -l 500000 is presumably the maximum intron length, as in LeafCutter's clustering script -- confirm.
echo "leafcutter_cluster_regtools"
python2 "$LEAF_SCRIPTS/splicemutr_leafcutter_cluster_regtools.py" -j "$JUNC_DIR/junc_file.txt" -r "$JUNC_DIR" -o data -l 500000

# Step 2: differential splicing on the per-individual counts.
# The "$" in front of LEAF_SCRIPTS is required; without it the shell looks for a
# literal relative path "LEAF_SCRIPTS/leafcutter_ds.R" and this step fails.
echo "leafcutter_ds"
"$LEAF_SCRIPTS/leafcutter_ds.R" --num_threads 1 --exon_file="$REF_DIR/G026.exons.txt.gz" -o "$JUNC_DIR/leafcutter_ds" "$JUNC_DIR/data_perind_numers.counts.gz" "$GROUPS_FILE"

# Step 3: bundle counts, cluster significance and effect sizes into data.Rdata.
echo "prepare_results"
"$LEAFVIZ_DIR/prepare_results.R" -o "$JUNC_DIR/data.Rdata" -m "$GROUPS_FILE" "$JUNC_DIR/data_perind_numers.counts.gz" "$JUNC_DIR/leafcutter_ds_cluster_significance.txt" "$JUNC_DIR/leafcutter_ds_effect_sizes.txt" "$REF_DIR/G026"

echo $(date)

"""


# Downloading the recount3 gene expression per TCGA sample

## recount3_extract_gene_expression.R usage

In [None]:

"""

# SGE job script: runs recount3_extract_gene_expression.R for a single TCGA cancer type.
# The "#$" lines are scheduler directives (job name, shell, memory, log files, notification address).

#!/bin/sh
#$ -N recount3_expression
#$ -S /bin/sh
#$ -l mem_free=30G,h_vmem=30G
#$ -o /recount3_juncs/gene_expression.o
#$ -e /TCGA/recount3_juncs/gene_expression.e
#$ -M tpalme15@jhmi.edu

# Print the current date/time; called before and after the work so the log brackets the run.
log_timestamp() {
    echo $(date)
}

log_timestamp

# Activate the conda environment that provides R.
module load conda
source activate /miniconda3/envs/R-4.0.2

SPLICEMUTR_SCRIPTS=/splicemute/scripts
EXPR_OUT_DIR=/recount3_juncs/TCGA_juncs
TCGA_TYPE="PAAD" # example TCGA cancer type; replace with the type to extract

${SPLICEMUTR_SCRIPTS}/recount3_extract_gene_expression.R -o ${EXPR_OUT_DIR} -c ${TCGA_TYPE}

log_timestamp

"""


# Forming the junction expression object

## create_junc_expr_TCGA.R usage

In [None]:

"""

# SGE array job: each task handles the cancer subtype on line $SGE_TASK_ID of cancer_dirs.txt
# and runs create_junc_expr_TCGA.R for it.
#!/bin/sh
#$ -N create_junc_expr
#$ -S /bin/sh
#$ -l mem_free=30G,h_vmem=30G
#$ -o /create_junc_expr/junc_expr.o
#$ -e /create_junc_expr/junc_expr.e
#$ -M tpalme15@jhmi.edu
#$ -t 1-15 -tc 15

# Print the current date/time; called before and after the work so the log brackets the run.
log_timestamp() {
    echo $(date)
}

log_timestamp

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

SPLICEMUTR_SCRIPTS=/users/tpalmer/splicemute/scripts

TCGA_BASE=/splicemutr_TCGA
CANCER_LIST=/splicemutr_TCGA/cancer_dirs.txt # one TCGA cancer subtype per line
CANCER_NAME=$(sed -n ${SGE_TASK_ID}p $CANCER_LIST)
CANCER_PATH=$TCGA_BASE/$CANCER_NAME

# Create the per-cancer output directory only if it is not already there.
COUNTS_DIR=$CANCER_PATH/junction_counts
[[ -d $COUNTS_DIR ]] || mkdir $COUNTS_DIR

RSE_INPUT=$CANCER_PATH/junc_rse.rds
# Combined SpliceMutr output, used to filter junction expression down to the relevant splice junctions.
SPLICEMUTR_INPUT=$CANCER_PATH/combine_splicemutr_out/data_splicemutr_all_pep.rds

$SPLICEMUTR_SCRIPTS/create_junc_expr_TCGA.R -j $RSE_INPUT -s $SPLICEMUTR_INPUT -o $COUNTS_DIR

log_timestamp

"""


# Saving the introns output from LeafCutter

## save_introns.R usage

In [None]:

"""

# SGE array job: each task handles the cancer subtype on line $SGE_TASK_ID of cancer_dirs.txt
# and runs save_introns.R on that subtype's LeafCutter data.Rdata file.
#!/bin/sh
# Job name matches the step it runs (it was previously a copy of the create_junc_expr
# job name, which made the jobs indistinguishable in qstat and in -hold_jid dependencies).
#$ -N save_introns
#$ -S /bin/sh
#$ -l mem_free=10G,h_vmem=10G
#$ -o /save_introns/introns_save.o
#$ -e /save_introns/introns_save.e
#$ -M tpalme15@jhmi.edu
#$ -t 1-15 -tc 15

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

TCGA_ROOT_DIR=/splicemutr_TCGA
TCGA_CANCER_FILE=/splicemutr_TCGA/cancer_dirs.txt # a file containing one TCGA cancer subtype per line
TCGA_CANCER=$(sed -n "${SGE_TASK_ID}p" "$TCGA_CANCER_FILE")
INTRON_FILE=$TCGA_ROOT_DIR/$TCGA_CANCER/data.Rdata
OUT_DIR=$TCGA_ROOT_DIR/$TCGA_CANCER

SCRIPT_DIR=/users/tpalmer/splicemute/scripts

"$SCRIPT_DIR/save_introns.R" -i "$INTRON_FILE" -o "$OUT_DIR"

echo $(date)

"""


# Splitting the splice junctions of each TCGA cancer subtype into smaller files

## split_introns.R usage

In [None]:

"""

# SGE array job: each task handles the cancer subtype on line $SGE_TASK_ID of cancer_dirs.txt
# and runs split_introns.R on that subtype's LeafCutter data.Rdata file.
#!/bin/sh
# Job name matches the step it runs (it was previously a copy of the create_junc_expr
# job name, which made the jobs indistinguishable in qstat and in -hold_jid dependencies).
#$ -N split_introns
#$ -S /bin/sh
#$ -l mem_free=10G,h_vmem=10G
#$ -o /save_introns/introns_split.o
#$ -e /save_introns/introns_split.e
#$ -M tpalme15@jhmi.edu
#$ -t 1-15 -tc 15

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

TCGA_ROOT_DIR=/splicemutr_TCGA
TCGA_CANCER_FILE=/splicemutr_TCGA/cancer_dirs.txt # a file containing one TCGA cancer subtype per line
TCGA_CANCER=$(sed -n "${SGE_TASK_ID}p" "$TCGA_CANCER_FILE")
INTRON_FILE=$TCGA_ROOT_DIR/$TCGA_CANCER/data.Rdata
OUT_DIR=$TCGA_ROOT_DIR/$TCGA_CANCER
SPLIT_NUM=5000 # value passed to split_introns.R via -s (NOTE(review): presumably introns per output file -- confirm)

SCRIPT_DIR=/users/tpalmer/splicemute/scripts

"$SCRIPT_DIR/split_introns.R" -i "$INTRON_FILE" -o "$OUT_DIR" -s "$SPLIT_NUM"

echo $(date)

"""


# Forming the transcripts using SpliceMutr

## form_transcripts.R usage

In [None]:

"""

# SGE array job: each task takes the intron file on line $SGE_TASK_ID of intron_files.txt
# and runs form_transcripts.R on it.
#!/bin/sh
#$ -N form_trans
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=10G
#$ -o /form_transcripts/trans_form.o
#$ -e /form_transcripts/trans_form.e
#$ -M tpalme15@jhmi.edu
#$ -t 1-222 -tc 222

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

INTRONS=/splicemutr_TCGA/intron_files.txt # list of every intron file across all TCGA cancer subtypes, one path per line
INTRON_FILE=$(sed -n "${SGE_TASK_ID}p" "$INTRONS")

# The cancer directory is two levels above the intron file; reuse INTRON_FILE
# rather than reading the list a second time.
TCGA_CANCER_DIR=$(dirname "$(dirname "$INTRON_FILE")")
OUT=$TCGA_CANCER_DIR/formed_transcripts

# Make sure the output directory exists; -p is a no-op when it already does and is
# safe when several array tasks of the same cancer type start at once.
mkdir -p "$OUT"

TXDB=/reference/recount3/G026_txdb.sqlite

# Output prefix = intron file name without its trailing ".rds".
# basename strips only a literal suffix, unlike a sed pattern with an unescaped dot.
OUT_PREFIX=$OUT/$(basename "$INTRON_FILE" .rds)
SCRIPT_DIR=/splicemute/scripts

"$SCRIPT_DIR/form_transcripts.R" -o "$OUT_PREFIX" -t "$TXDB" -j "$INTRON_FILE" -b BSgenome.Hsapiens.GENCODE.GRCh38.p10

echo $(date)

"""


# Calculating the coding potential for each SpliceMutr formed transcripts output file

## calc_coding_potential.R usage

In [None]:

"""

# SGE array job: each task handles the cancer subtype on line $SGE_TASK_ID of cancer_dirs.txt
# and runs calc_coding_potential.R once per file listed in that subtype's filenames.txt.
#!/bin/sh
#$ -N calc_coding_potential
#$ -S /bin/sh
#$ -l mem_free=2G,h_vmem=2G
#$ -o /calc_coding_potential/cod_pot_calc.o
#$ -e /calc_coding_potential/cod_pot_calc.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -t 1-16 -tc 16

# Print the current date/time; called before and after the work so the log brackets the run.
log_timestamp() {
    echo $(date)
}

log_timestamp

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

# These two paths are the same for every file, so they are set once up front.
SPLICEMUTR_SCRIPTS=/users/tpalmer/splicemute/scripts
FUNCTIONS_R=/users/tpalmer/splicemute/R/functions.R

TCGA_BASE=/splicemutr_TCGA
CANCER_LIST=/splicemutr_TCGA/cancer_dirs.txt
CANCER_NAME=$(sed -n ${SGE_TASK_ID}p $CANCER_LIST)

TRANSCRIPT_DIR=$TCGA_BASE/$CANCER_NAME/formed_transcripts
FILE_LIST=$TRANSCRIPT_DIR/filenames.txt

# Walk the list by line number, from the first line to the line count reported by wc.
N_FILES=$(wc -l $FILE_LIST | awk '{print $1}')
IDX=1
while (( IDX <= N_FILES ))
do
    SPLICEMUTR_RDS=$(sed -n ${IDX}p $FILE_LIST)
    # The sequence file shares the name, with the SpliceMutr suffix swapped for "_sequences.fa".
    SEQUENCE_FA=$(echo $SPLICEMUTR_RDS | sed s/'_data_splicemutr.rds'/'_sequences.fa'/g)

    $SPLICEMUTR_SCRIPTS/calc_coding_potential.R -o $TRANSCRIPT_DIR -s $SPLICEMUTR_RDS -t $SEQUENCE_FA -f $FUNCTIONS_R

    IDX=$(( IDX + 1 ))
done

log_timestamp

"""


# Combining the formed transcripts output from SpliceMutr

## combine_splicemutr.R usage

In [None]:

"""

# SGE array job: each task handles the cancer subtype on line $SGE_TASK_ID of cancer_dirs.txt
# and runs combine_splicemutr.R on the files listed in that subtype's filenames_cp.txt.
#!/bin/sh
#$ -N combine_splicemutr
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=10G
#$ -o /combine_splicemutr/splice_comb_cp.o
#$ -e /combine_splicemutr/splice_comb_cp.e
#$ -M tpalme15@jhmi.edu
#$ -t 1-16 -tc 16

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

TCGA_ROOT_DIR=/splicemutr_TCGA
TCGA_CANCER_FILE=/splicemutr_TCGA/cancer_dirs.txt # one TCGA cancer subtype per line
TCGA_CANCER=$(sed -n "${SGE_TASK_ID}p" "$TCGA_CANCER_FILE")

SPLICE_FILES=$TCGA_ROOT_DIR/$TCGA_CANCER/formed_transcripts/filenames_cp.txt
OUT=$TCGA_ROOT_DIR/$TCGA_CANCER/combine_splicemutr_out_cp

# -p makes the step re-runnable: no error when the directory already exists.
mkdir -p "$OUT"
SCRIPT_DIR=/splicemute/scripts

"$SCRIPT_DIR/combine_splicemutr.R" -o "$OUT" -s "$SPLICE_FILES"

echo $(date)

"""


# Processing the peptides associated with the SpliceMutr-formed data

## process_peptides.py usage

In [None]:

"""

# SGE array job: each task handles the cancer subtype on line $SGE_TASK_ID of cancer_dirs.txt
# and runs process_peptides.py on that subtype's proteins.txt.
#!/bin/sh
#$ -N process_peptides
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=5G
#$ -o /process_peptides/peps_proc.o
#$ -e /process_peptides/peps_proc.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -t 1-15 -tc 15

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

echo $(date)

SCRIPT_DIR=/splicemute/inst
TCGA_ROOT_DIR=/splicemutr_TCGA
TCGA_CANCER_FILE=/splicemutr_TCGA/cancer_dirs.txt # one TCGA cancer subtype per line
TCGA_CANCER=$(sed -n "${SGE_TASK_ID}p" "$TCGA_CANCER_FILE")
PEPTIDES=$TCGA_ROOT_DIR/$TCGA_CANCER/combine_splicemutr_out/proteins.txt
OUT_DIR=$TCGA_ROOT_DIR/$TCGA_CANCER/process_peptides_out

# -p makes the step re-runnable: no error when the directory already exists.
mkdir -p "$OUT_DIR"
KMER_LENGTH=9 # k-mer length passed to process_peptides.py via -k

"$SCRIPT_DIR/process_peptides.py" -p "$PEPTIDES" -o "$OUT_DIR" -k "$KMER_LENGTH"

echo $(date)

"""


# Calculating the ic50 score for each HLA allele and splice-junction-modified transcript

## runMHCnuggets.py usage

In [None]:

"""

# SGE array job: each task handles the cancer subtype on line $SGE_TASK_ID of cancer_dirs.txt
# and runs runMHCnuggets.py on that subtype's 9-mer peptides and class 1 allele list.

#!/bin/sh
#$ -N mhcnuggets
#$ -S /bin/sh
#$ -l mem_free=15G,h_vmem=15G
#$ -o /running_mhcnuggets/mhc_run.o
#$ -e /running_mhcnuggets/mhc_run.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -t 1-16 -tc 16

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

TCGA_ROOT_DIR=/splicemutr_TCGA
TCGA_CANCER_FILE=/splicemutr_TCGA/cancer_dirs.txt # one TCGA cancer subtype per line
TCGA_CANCER=$(sed -n "${SGE_TASK_ID}p" "$TCGA_CANCER_FILE")

TYPE="I" # value passed via -t (NOTE(review): presumably the MHC class -- confirm)
INPUT_KMERS=$TCGA_ROOT_DIR/$TCGA_CANCER/process_peptides_out/peps_9.txt # an output from the previous step
MHC_ALLELE_FILE=$TCGA_ROOT_DIR/$TCGA_CANCER/${TCGA_CANCER}_class1_alleles.txt # all unique class 1 HLA alleles extracted from The Immune Landscape of Cancer Optitype calls
OUT_DIR=$TCGA_ROOT_DIR/$TCGA_CANCER/mhcnuggets_out

# -p makes the step re-runnable: no error when the directory already exists.
mkdir -p "$OUT_DIR"
SCRIPT_DIR=/users/tpalmer/splicemute/inst

"$SCRIPT_DIR/runMHCnuggets.py" -t "$TYPE" -k "$INPUT_KMERS" -m "$MHC_ALLELE_FILE" -o "$OUT_DIR"

echo $(date)

"""


# Processing the binding affinity output from MHCnuggets

## process_bindaff.py usage

In [None]:

"""

# SGE array job: each task takes the allele file on line $SGE_TASK_ID of allele_files.txt,
# filters it, and runs process_bindaff.py on the filtered file.

#!/bin/sh
#$ -N process_bindaff
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=10G
#$ -o /process_bindaff/bindaff_proc.o
#$ -e /process_bindaff/bindaff_proc.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -t 1-1999 -tc 100

# Print the current date/time; called before and after the work so the log brackets the run.
log_timestamp() {
    echo $(date)
}

log_timestamp

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

SPLICEMUTR_INST=/splicemute/inst
KMER_LENGTH=9

ALLELE_LIST=/splicemutr_TCGA/allele_files.txt
ALLELE_PATH=$(sed -n ${SGE_TASK_ID}p $ALLELE_LIST)

# The cancer directory is two levels above the allele file.
CANCER_ROOT=$(dirname $(dirname $ALLELE_PATH))
PEPTIDE_PICKLES=$CANCER_ROOT/process_peptides_out

# Create the output directory only if it is not already there.
BINDAFF_DIR=$CANCER_ROOT/process_bindaff_out
[[ -d $BINDAFF_DIR ]] || mkdir $BINDAFF_DIR

# Keep only the rows whose second comma-separated field is <= 500
# (NOTE(review): presumably the predicted IC50 -- confirm), written to <allele>_filt.txt.
FILTERED=$BINDAFF_DIR/$(echo $(basename $ALLELE_PATH) | sed 's/.txt/_filt.txt/g')
awk -F "," '{ if ($2 <= 500) { print } }' $ALLELE_PATH > $FILTERED

$SPLICEMUTR_INST/process_bindaff.py -b $FILTERED -p $PEPTIDE_PICKLES -o $BINDAFF_DIR -k $KMER_LENGTH

log_timestamp

"""


# Extracting out SpliceMutr data

## extract_data.py usage

In [None]:

"""

# SGE array job: each task handles the cancer subtype on line $SGE_TASK_ID of cancer_dirs.txt
# and runs extract_data.py once per allele listed in that subtype's class 1 allele file.

#!/bin/sh
#$ -N extract_data
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=10G
#$ -o /extract_data/data_extract.o
#$ -e /extract_data/data_extract.e
#$ -M tpalme15@jhmi.edu
#$ -m ea
#$ -t 1-15 -tc 15

echo $(date)

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

TCGA_ROOT_DIR=/splicemutr_TCGA
TCGA_CANCER_FILE=/splicemutr_TCGA/cancer_dirs.txt # one TCGA cancer subtype per line

# Select this task's cancer subtype. This assignment was missing, which left
# TCGA_CANCER empty and pointed MHC_ALLELE_FILE at a non-existent path.
TCGA_CANCER=$(sed -n "${SGE_TASK_ID}p" "$TCGA_CANCER_FILE")
MHC_ALLELE_FILE=$TCGA_ROOT_DIR/$TCGA_CANCER/${TCGA_CANCER}_class1_alleles.txt # all unique class 1 HLA alleles extracted from The Immune Landscape of Cancer Optitype calls

# NOTE(review): the other steps write process_bindaff_out under $TCGA_ROOT_DIR/$TCGA_CANCER;
# confirm whether this fixed path is intended.
PICKLE_DIR=/process_bindaff/process_bindaff_out
SCRIPT_DIR=/users/tpalmer/splicemute/inst

LINES_IN_FILE=$(wc -l "$MHC_ALLELE_FILE" | awk '{print $1}')
START=1

# Iterate over the allele file by line number (the stray ")" that made this
# loop header a syntax error has been removed).
for (( VAL=$START; VAL<=$LINES_IN_FILE; VAL++ ))
do
    ALLELE=$(sed -n "${VAL}p" "$MHC_ALLELE_FILE")

    # NOTE(review): -b 9 -e 10 are presumably the begin/end of a k-mer length range -- confirm against extract_data.py.
    "$SCRIPT_DIR/extract_data.py" -a "$ALLELE" -p "$PICKLE_DIR" -b 9 -e 10
done

echo $(date)

"""


# Analyzing the SpliceMutr data for TCGA

## analyze_splicemutr.py usage

The ${CANCER}_genotypes_specific.txt file is a tab-delimited file with the following columns: 
A1      A2      B1      B2      C1      C2      aliquot_id      type    external_id
A1 corresponds to the first HLA-A allele for the sample, B1 to the first HLA-B allele, and C1 to the first HLA-C allele (A2, B2, and C2 are the corresponding second alleles). aliquot_id is the TCGA barcode, type indicates whether the sample is a tumor or normal sample, and external_id is the recount3-specific id for the TCGA sample. This file is generated from the Optitype calls for the TCGA data; it is controlled data and will not be shared.

In [None]:

"""

# SGE array job for one cancer type (BLCA here): every task runs analyze_splicemutr.py
# and passes its own $SGE_TASK_ID through -n.
#!/bin/sh
#$ -N BLCA_analyze_splicemutr
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=10G
#$ -o /analyze_splicemutr/splice_analyze_BLCA.o
#$ -e /analyze_splicemutr/splice_analyze_BLCA.e
#$ -M tpalme15@jhmi.edu
#$ -t 1-433 -tc 100

# Print the current date/time; called before and after the work so the log brackets the run.
log_timestamp() {
    echo $(date)
}

log_timestamp

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

# Record which cancer type and task this log belongs to.
CANCER=BLCA
echo $CANCER $SGE_TASK_ID

SPLICEMUTR_INST=/users/tpalmer/splicemute/inst
METRIC='IC50'

# All inputs and outputs live under the cancer type's own directory.
CANCER_PATH=/splicemutr_TCGA/$CANCER
GENOTYPE_TABLE=$CANCER_PATH/${CANCER}_genotypes_specific.txt
BINDAFF_DIR=$CANCER_PATH/process_bindaff_out
SPLICEMUTR_TABLE=$CANCER_PATH/combine_splicemutr_out/data_splicemutr_all_pep_nov_corr.txt
RESULT_DIR=$CANCER_PATH/analyze_splicemutr_out

$SPLICEMUTR_INST/analyze_splicemutr.py -g $GENOTYPE_TABLE -s $BINDAFF_DIR -d $SPLICEMUTR_TABLE -o $RESULT_DIR -t $METRIC -n $SGE_TASK_ID

log_timestamp

"""


# Calculating the splicing antigenicity

## calc_gene_metric_len_norm.R usage

In [None]:

"""

# SGE array job: each task takes the junction expression file on line $SGE_TASK_ID of
# junc_expr_vst.txt and runs calc_gene_metric_len_norm.R on it.

#!/bin/sh
#$ -N calc_gene_metric
#$ -S /bin/sh
#$ -l mem_free=5G,h_vmem=10G
#$ -o /calc_gene_metric/gene_calc.o
#$ -e /calc_gene_metric/gene_calc.e
#$ -M tpalme15@jhmi.edu
#$ -t 1-81 -tc 81

# Print the current date/time; called before and after the work so the log brackets the run.
log_timestamp() {
    echo $(date)
}

log_timestamp

module load conda
source activate /users/tpalmer/miniconda3/envs/R-4.0.2

SPLICEMUTR_SCRIPTS=/splicemute/scripts

JUNC_EXPR_LIST=/splicemutr_TCGA/junc_expr_vst.txt
JUNC_EXPR_PATH=$(sed -n ${SGE_TASK_ID}p $JUNC_EXPR_LIST)

# Output files are named after the junction expression file, minus ".rds".
RUN_TAG=$(echo $(basename $JUNC_EXPR_PATH) | sed "s/.rds//g")

# The cancer directory is two levels above the junction expression file.
CANCER_ROOT=$(dirname $(dirname $JUNC_EXPR_PATH))

GENE_EXPR_RDS=$CANCER_ROOT/gene_expression_vst.rds
SPLICEMUTR_RDS=$CANCER_ROOT/combine_splicemutr_out_cp/data_splicemutr_all_pep.rds
KMER_COUNT_TABLE=$CANCER_ROOT/kmer_counts/all_kmers.txt

# Create the output directory only if it is not already there.
METRIC_DIR=$CANCER_ROOT/GENE_METRIC_CP
[[ -d $METRIC_DIR ]] || mkdir $METRIC_DIR

METRIC_PREFIX=$METRIC_DIR/$RUN_TAG

$SPLICEMUTR_SCRIPTS/calc_gene_metric_len_norm.R -g $GENE_EXPR_RDS -s $SPLICEMUTR_RDS -k $KMER_COUNT_TABLE -j $JUNC_EXPR_PATH -o $METRIC_PREFIX

log_timestamp

"""
