purpose: run predixcan and FUSION TWAS on externalizing1.0 data

# S-PrediXcan

In [None]:
#!/bin/bash
#SBATCH --job-name predixcan_ext
#SBATCH --partition condo
#SBATCH --qos condo
#SBATCH --nodes 1
#SBATCH -a 1-13 
#SBATCH -c 4
#SBATCH -t 2:00:00
#SBATCH --mem-per-cpu 8G
#SBATCH -o /tscc/nfs/home/bsleger/bsl/SUD_cross_species/job_run_out/predixcan_ext-%j.o
#SBATCH -e /tscc/nfs/home/bsleger/bsl/SUD_cross_species/job_run_out/predixcan_ext-%j.e
#SBATCH --mail-type END,FAIL
#SBATCH --mail-user bsleger@ucsd.edu
#SBATCH --account csd795

#cd /tscc/projects/ps-palmer/brittany/MetaXcan/GTEx/brain_models/
db_list=(`ls *.db`) #len=13 change back for rerun all

#failed_rerun=( en_Brain_Caudate_basal_ganglia.db en_Brain_Cerebellar_Hemisphere.db en_Brain_Cerebellum.db en_Brain_Cortex.db en_Brain_Hypothalamus.db en_Brain_Putamen_basal_ganglia.db )

cd /tscc/projects/ps-palmer/brittany/MetaXcan/
source activate imlabtools

m=${db_list[$SLURM_ARRAY_TASK_ID-1]}
#m=${failed_rerun[$SLURM_ARRAY_TASK_ID-1]}
echo $m

software/SPrediXcan.py \
--model_db_path  "GTEx/brain_models/"$m \
--covariance GTEx/gtex_v8_expression_elastic_net_snp_smultixcan_covariance.txt.gz \
--gwas_file /tscc/projects/ps-palmer/brittany/SUD_cross_species/ext_sumstat_2019/FINAL.EXT_COMMON_FACTOR.EXTERNALIZING.20191014.PREPARED.wFREQ.A1.txt.gz \
--snp_column SNP \
--effect_allele_column A1 \
--non_effect_allele_column A2 \
--beta_column BETA.A1 \
--pvalue_column P \
--output_file "results/predixcan_externalizing2019_"${m##*/}".csv"


# FUSION association

In [None]:
#decompress all of the GTEx models 
cd "${FUS_PATH}/WEIGHTS"
for f in *.tar.bz2; do
    tar xjf $f 
done

The primary input is genome-wide summary statistics in LD-score format. At minimum, this is a flat file with a header row containing the following fields:

SNP – SNP identifier (rsID)
A1 – first allele (effect allele)
A2 – second allele (other allele)
Z – Z-scores, sign with respect to A1.

## run fusion- script

In [None]:
#!/bin/bash
#SBATCH --job-name ext_FUSION
#SBATCH --partition condo
#SBATCH --qos condo
#SBATCH -a 1-9
#SBATCH --time 2:00:00
#SBATCH --nodes 1
#SBATCH --cpus-per-task 4
#SBATCH --mem-per-cpu 4G
#SBATCH -o /tscc/nfs/home/bsleger/bsl/SUD_cross_species/job_run_out/ext_FUSION-%j.o
#SBATCH -e /tscc/nfs/home/bsleger/bsl/SUD_cross_species/job_run_out/ext_FUSION-%j.e
#SBATCH --mail-type END,FAIL
#SBATCH --mail-user bsleger@ucsd.edu
#SBATCH --account csd795



FUS_PATH='/tscc/nfs/home/bsleger/bsl/fusion_twas-master/'
SUD_PATH='/tscc/projects/ps-palmer/brittany/SUD_cross_species/'
DATA_FILE='ext_sumstat_2019/FINAL.EXT_COMMON_FACTOR.EXTERNALIZING.20191014.PREPARED.wFREQ.A1.txt'
OUT_PATH="${SUD_PATH}ext_FUSION/"
OUT_PREF="ext2019"


cd "${FUS_PATH}WEIGHTS"
db_list=(`ls GTEx.Brain*.pos`)
 echo ${#db_list[*]} 
#db list ls 9 long - make job array that's 1-9

cd $FUS_PATH

source activate lzenv

TISSUE=${db_list[$SLURM_ARRAY_TASK_ID-1]}
#m=${db_list[1]}
echo $TISSUE


for ((CHR = 1; CHR < 23; CHR++));
do
    echo $CHR
    OUT=${OUT_PATH}${OUT_PREF}_${TISSUE}_${CHR}.dat
    echo $OUT
    Rscript FUSION.assoc_test.R \
    --sumstats $SUD_PATH$DATA_FILE \
    --weights "./WEIGHTS/"${TISSUE} \
    --weights_dir ./WEIGHTS/ \
    --ref_ld_chr ./LDREF/1000G.EUR. \
    --chr $CHR \
    --out $OUT
done

## concat results together

### Python- get set of prefixes 

In [1]:
import pandas as pd
import os

In [2]:
os.chdir("/tscc/projects/ps-palmer/brittany/SUD_cross_species/ext_FUSION")

In [3]:
files=os.listdir()

In [8]:
files
if 'FUSION_concat' in files:
    files.remove('FUSION_concat')

In [13]:
prefixes=set(map(lambda string: string.split(".")[0]+'.'+string.split(".")[1], files))
if '.ipynb_checkpoints' in prefixes:
    prefixes.remove('.ipynb_checkpoints')

In [15]:
str='( '
for f in prefixes:
    str=str+"'"+f+"' "
    
str=str[0:len(str)-1]+' )'
print(str)

( 'ext2019_GTEx.Brain_Caudate_basal_ganglia' 'ext2019_GTEx.Brain_Cortex' 'ext2019_GTEx.Brain_Putamen_basal_ganglia' 'ext2019_GTEx.Brain_Hippocampus' 'ext2019_GTEx.Brain_Nucleus_accumbens_basal_ganglia' 'ext2019_GTEx.Brain_Hypothalamus' 'ext2019_GTEx.Brain_Cerebellum' 'ext2019_GTEx.Brain_Frontal_Cortex_BA9' 'ext2019_GTEx.Brain_Cerebellar_Hemisphere' )


### bash concat files together

In [1]:
#bash
#run to concat all the files 
cd /tscc/projects/ps-palmer/brittany/SUD_cross_species/ext_FUSION
#make directory for concated files
if [ ! -d "FUSION_concat" ]; then
  mkdir FUSION_concat

fi

prefixes=( 'ext2019_GTEx.Brain_Caudate_basal_ganglia' 'ext2019_GTEx.Brain_Cortex' 'ext2019_GTEx.Brain_Putamen_basal_ganglia' 'ext2019_GTEx.Brain_Hippocampus' 'ext2019_GTEx.Brain_Nucleus_accumbens_basal_ganglia' 'ext2019_GTEx.Brain_Hypothalamus' 'ext2019_GTEx.Brain_Cerebellum' 'ext2019_GTEx.Brain_Frontal_Cortex_BA9' 'ext2019_GTEx.Brain_Cerebellar_Hemisphere' )

for p in ${prefixes[@]}; do
    for CHR in {1..22}; do
        # Define the file
        file="${p}.pos_${CHR}.dat"
        echo $file
        if [ ${CHR} -eq 1 ];
        then
            cp $file FUSION_concat/${p}.dat
        else
            awk 'FNR>1' $file >> FUSION_concat/${p}.dat
            
        fi
        if [ $CHR -eq 6 ];
        then
            file="${p}.pos_${CHR}.dat.MHC"
            echo $file
            awk 'FNR>1' $file >> FUSION_concat/${p}.dat

        fi   
    done
done

# makes files  /tscc/projects/ps-palmer/brittany/SUD_cross_species/nicsa_gwas/results/gwas/mlma_concat/regressedlr_cigday_pc1.mlma
# /tscc/projects/ps-palmer/brittany/SUD_cross_species/nicsa_gwas/results/gwas/mlma_concat/regressedlr_cigday_pc1.mlma

ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_1.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_2.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_3.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_4.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_5.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_6.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_6.dat.MHC
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_7.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_8.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_9.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_10.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_11.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_12.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_13.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_14.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_15.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_16.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_17.dat
ext2019_GTEx.Brain_Caudate_basal_ganglia.pos_18.dat
ext2019_GTEx.Brain

predixcan and fusion results compared in a different notebook :) 

SUD_cross_species/scripts/TWAS_FUSION_predixcan_comparison_human_rat.ipynb

# rat fusion

## decompress input files- BASH (only run once)

In [None]:
#decompress input files

tissues=( 'Adipose'  'BLA'  'Brain'  'Eye'  'IL'  'LHb'  'Liver'  'NAcc'  'NAcc1'  'NAcc2'  'OFC'  'PL'  'PL1'  'PL2' )
for d in ${tissues[@]}; do    
echo $d
cd $d
    for f in *.tar.bz2; do
        tar xjf $f 
    done
cd ..
done

## calculate z-scores rat gwas sumstats- python env-std-py38

In [1]:
import os
import pandas as pd

In [2]:
os.chdir('/tscc/projects/ps-palmer/brittany/SUD_cross_species/')

In [3]:
# first run for meta analysis- same as Daniel ran, compare to check that method is the same
t=pd.read_csv('loco_meta/regressedlr_combined_locomotor_sumstat.mlma',sep='\t')
t.columns=t.columns.str.upper()
t['SNP']='chr'+t['SNP']
t['Z']=t['B']/t['SE']
#t.to_csv('loco_meta/regressedlr_combined_locomotor_sumstat_zscores.mlma',sep='\t',index=False)

In [None]:
#first run of multivariate analysis
t=pd.read_csv('loco_commonFactor/combined_gwas_results_CF_MPH.txt',sep='\t')
t['Z']=t['B']/t['SE']
#t.to_csv('loco_commonFactor/combined_gwas_results_CF_MPH_zscores.txt',sep='\t')

In [3]:
#final run of multivariate analysis
t=pd.read_csv('loco_final/mlma_concat/regressedlr_gsem_results_commonfactor_F1_common_chrgwas.mlma',sep='\t')
t.columns=t.columns.str.upper()
t['SNP']='chr'+t['SNP']
t['Z']=t['B']/t['SE']
t.to_csv('loco_final/mlma_concat/regressedlr_gsem_results_commonfactor_F1_common_chrgwas_zscores.mlma',sep='\t',index=False)

In [13]:
#final run mega analysis
t=pd.read_csv('loco_final/mlma_concat/regressedlr_locomotor_mega_chrgwas.mlma',sep='\t')
t.columns=t.columns.str.upper()
t['SNP']='chr'+t['SNP']
t['Z']=t['B']/t['SE']
t.to_csv('loco_final/mlma_concat/regressedlr_locomotor_mega_chrgwas_zscores.mlma',sep='\t')

In [5]:
# BMI from 2023 paper- this was with RN6 so have to use different references- need to ask daniel for gene weights
t=pd.read_csv('~/bsl/bmi_ucsd_lib/results.bmi_wo_tail.csv')
t.columns=t.columns.str.upper()
t['Z']=t['B']/t['SE']
t.to_csv('rat_ctrl/bmi_wo_tail.txt',sep='\t',index=False)

In [8]:
t

Unnamed: 0,CHR,SNP,BP,A1,A2,FREQ,B,SE,P,Z
0,10,chr10:142747,142747,T,C,0.887077,-0.061692,0.033176,0.062953,-1.859522
1,10,chr10:143014,143014,T,C,0.887077,-0.061692,0.033176,0.062953,-1.859522
2,10,chr10:143363,143363,T,C,0.887077,-0.061692,0.033176,0.062953,-1.859522
3,10,chr10:143699,143699,G,C,0.887214,-0.061704,0.033184,0.062962,-1.859456
4,10,chr10:143715,143715,C,T,0.887214,-0.061704,0.033184,0.062962,-1.859456
...,...,...,...,...,...,...,...,...,...,...
3513489,9,chr9:121994902,121994902,A,C,0.480107,0.002695,0.020213,0.893928,0.133335
3513490,9,chr9:121998393,121998393,A,G,0.480107,0.002695,0.020213,0.893928,0.133335
3513491,9,chr9:122022381,122022381,T,G,0.010700,0.077545,0.085786,0.366029,0.903935
3513492,9,chr9:122022385,122022385,A,T,0.010789,0.092433,0.085443,0.279339,1.081805


In [7]:
ldref=pd.read_csv('rat_fusion/LDREF/Brain_rn7.9.bim',sep='\t',header=None)

In [16]:
#check if the SNP IDs match between references
len(set(ldref[1]).intersection(t.SNP))/len(set(ldref[1]))

0.9924505696489831

In [18]:
wt=pd.read_csv('rat_fusion/twas-weights-rn7/NAcc/expression.pos',sep='\t')

In [20]:
wt[wt.CHR==9]

Unnamed: 0,WGT,ID,CHR,P0,P1,N
612,expression/ENSRNOG00000003242.wgt.RDat,ENSRNOG00000003242,9,46622668,46622669,270
693,expression/ENSRNOG00000003825.wgt.RDat,ENSRNOG00000003825,9,47903199,47903200,270
740,expression/ENSRNOG00000004076.wgt.RDat,ENSRNOG00000004076,9,48253409,48253410,270
850,expression/ENSRNOG00000004719.wgt.RDat,ENSRNOG00000004719,9,6533638,6533639,270
1737,expression/ENSRNOG00000010174.wgt.RDat,ENSRNOG00000010174,9,16887738,16887739,270
...,...,...,...,...,...,...
6225,expression/ENSRNOG00000070018.wgt.RDat,ENSRNOG00000070018,9,103827363,103827364,270
6229,expression/ENSRNOG00000070049.wgt.RDat,ENSRNOG00000070049,9,113866731,113866732,270
6247,expression/ENSRNOG00000070300.wgt.RDat,ENSRNOG00000070300,9,113378366,113378367,270
6299,expression/ENSRNOG00000071060.wgt.RDat,ENSRNOG00000071060,9,106093754,106093755,270


## run fusion- bash

In [57]:
conda activate lzenv
 
TISSUE='NAcc'
VAR_TYPE='expression'

FUS_PATH='/tscc/nfs/home/bsleger/bsl/fusion_twas-master/'

cd $FUS_PATH
SUD_PATH='/tscc/projects/ps-palmer/brittany/SUD_cross_species/'
OUT_PATH="${SUD_PATH}rat_fusion/output/"
OUT_PREF='loco_meta'
DATA_FILE='loco_meta/regressedlr_combined_locomotor_sumstat_zscores.mlma'
FUS_REF_PATH=${SUD_PATH}'rat_fusion/'
WEIGHTS_PATH=$FUS_REF_PATH"twas-weights-rn7/"${TISSUE}

for ((CHR = 1; CHR < 21; CHR++)); do
    OUT=${OUT_PATH}${OUT_PREF}_${TISSUE}_${CHR}.dat
    echo $OUT
    Rscript FUSION.assoc_test.R \
    --sumstats $SUD_PATH$DATA_FILE \
    --weights ${WEIGHTS_PATH}/${VAR_TYPE}'.pos' \
    --weights_dir $WEIGHTS_PATH \
    --ref_ld_chr $FUS_REF_PATH/LDREF/Brain_rn7. \
    --chr $CHR \
    --out $OUT
done

In [37]:
for ((CHR = 1; CHR < 22; CHR++)); do
    OUT=${OUT_PATH}${OUT_PREF}_${TISSUE}_${CHR}.dat
    echo $OUT
    Rscript FUSION.assoc_test.R \
    --sumstats $SUD_PATH$DATA_FILE \
    --weights ${WEIGHTS_PATH}/${VAR_TYPE}'.pos' \
    --weights_dir $WEIGHTS_PATH \
    --ref_ld_chr $FUS_REF_PATH/LDREF/Brain_rn7. \
    --chr $CHR \
    --out $OUT
done

bash: Rscript: command not found


: 127

## concat files

In [1]:
#bash
#run to concat all the files 
cd /tscc/projects/ps-palmer/brittany/SUD_cross_species/rat_fusion/output
#make directory for concated files
if [ ! -d "FUSION_concat" ]; then
  mkdir FUSION_concat

fi

#prefixes=( 'loco_gsem_NAcc' 'loco_gsem_NAcc1' 'loco_gsem_NAcc2' )
prefixes=( 'loco_final_mega' )
db_list=( 'Adipose'  'BLA'  'Brain'  'Eye'  'IL'  'LHb'  'Liver'  'NAcc'  'NAcc1'  'NAcc2'  'OFC'  'PL'  'PL1'  'PL2' )


for tissue in ${db_list[@]}; do
    echo $tissue
    for pref in ${prefixes[@]}; do
        echo $pref
        p=${pref}_${tissue}
        echo $p
        for CHR in {1..20}; do
            # Define the file
            file="${p}_${CHR}.dat"
            echo $file
            if [ ${CHR} -eq 1 ];
            then
                cp $file FUSION_concat/${p}.dat
            else
                awk 'FNR>1' $file >> FUSION_concat/${p}.dat 
            fi
            if [ $CHR -eq 6 ];
            then
                file="${p}_${CHR}.dat.MHC"
                echo $file
                awk 'FNR>1' $file >> FUSION_concat/${p}.dat
    
            fi   
        done
    done
done

SyntaxError: invalid syntax (713046142.py, line 5)