### Gene Expression
__Versions__  
Bedtools: 2.29.0   
R: 3.6.1  
DESeq2 1.26.0  
GNU Awk 4.0.2   
GNU grep 2.20   
GNU Coreutils 8.22   

##### Pipeline to compare gene expression of orthologous and non-orthologous genes in two species. 
(Figure 6)

In [None]:
# Split genes by whether they overlap conserved or non-conserved TADs, then
# count the 1-1 orthologs and the differentially expressed genes in each set.
# Inputs:  TAD tables under /dmel_to_dtri_domains/, the protein-coding gene
#          BED file, dtri_dmel.1to1_orthologs and deseq_output_all.csv.
# Outputs: t1-t4 (scratch), the two *_final.bed TAD files and their
#          .GENES.bed / .GENES.IDs derivatives; four counts on stdout.
conserved=dtri_lo_dmel_0.9r_final
nonconserved=dtri_lo_dmel_NOT_0.9r_final
genes_bed=dmel-all-r6.21.proteinCoding.MullerIDs.bed

# GET DMEL TAD COLUMNS FOR CHROM, START, AND STOP
# (the three coordinate columns sit at a different offset in each input table)
cut -f 4-6 /dmel_to_dtri_domains/dtri_lo_dmel_0.9r_final | sort -k1,1 -k2,2n | uniq > t1
cut -f 5-7 /dmel_to_dtri_domains/missing_split | sort -k1,1 -k2,2n | uniq > t2
cut -f 4-6 /dmel_to_dtri_domains/dtri_lo_dmel_NOT0.9_contig | sort -k1,1 -k2,2n | uniq > t3
cut -f 7-9 /dmel_to_dtri_domains/dtri_lo_dmel_tande | sort -k1,1 -k2,2n | uniq > t4

# GET RID OF ANY DUPLICATE ENTRIES AND COMBINE THE 3 NONCONSERVED TAD FILES
# `| uniq` (whole-line comparison) is kept on purpose: `sort -u` with -k keys
# would collapse lines that merely share chrom and start.
sort -k1,1 -k2,2n t1 | uniq > "${conserved}.bed"
sort -k1,1 -k2,2n t2 t3 t4 | uniq > "${nonconserved}.bed"

# GET GENES THAT OVERLAP CONSERVED AND NONCONSERVED DOMAINS,
# THEN MAKE A NEW FILE WITH ONLY GENE IDs (column 4 of the gene BED)
for tads in "$conserved" "$nonconserved"; do
  bedtools intersect -u -a "$genes_bed" -b "${tads}.bed" > "${tads}.GENES.bed"
  cut -f 4 "${tads}.GENES.bed" | sort | uniq > "${tads}.GENES.IDs"
done

# COUNT HOW MANY 1-1 ORTHOLOGS ARE IN CONSERVED VS NONCONSERVED DOMAINS
# -F: the ID list is matched as fixed strings, not as regular expressions.
# Prints the conserved count first, then the non-conserved count.
for tads in "$conserved" "$nonconserved"; do
  grep -c -F -f "${tads}.GENES.IDs" dtri_dmel.1to1_orthologs
done

# USE DESEQ OUTPUT TO CREATE TWO FILES: ONE FOR CONSERVED DOMAIN GENES AND ONE FOR NONCONSERVED
# t1 and t2 are reused here: from this point they hold the CSV header plus the
# DESeq rows for conserved-domain (t1) and non-conserved-domain (t2) genes.
{
  head -1 deseq_output_all.csv
  grep -F -f "${conserved}.GENES.IDs" deseq_output_all.csv
} > t1
{
  head -1 deseq_output_all.csv
  grep -F -f "${nonconserved}.GENES.IDs" deseq_output_all.csv
} > t2

# COUNT THE NUMBER OF CONSERVED AND NONCONSERVED DIFFERENTIALLY EXPRESSED GENES WITH ADJUSTED PVALUE<=0.05
# Column 7 of the CSV is the padj column; the header row and NA values are
# skipped before the threshold is applied. `n + 0` prints 0 when nothing matches.
for subset in t1 t2; do
  awk -F, '$7 !~ /padj/ && $7 !~ /NA/ && $7 <= 0.05 { n++ } END { print n + 0 }' "$subset"
done

Fisher's Exact Test to compare differentially expressed genes between conserved and nonconserved TADs.

In [None]:
# Build the 2x2 contingency table. matrix() fills column by column, so
# column 1 is (99, 1092 - 99) and column 2 is (501, 4739 - 501).
# NOTE(review): presumably each column is one TAD class (conserved vs.
# non-conserved) split into DE / non-DE gene counts -- confirm which is which.
m <- matrix(c(99, 1092 - 99, 501, 4739 - 501), nrow = 2)

# Fisher's exact test on the 2x2 table.
fisher.test(m)

#### Calculate the observed and expected fraction of DE genes within 10kb of lineage-specific boundaries 
(Figure 4)

In [None]:
# Observed vs. expected number of DE genes within 10 kb of lineage-specific
# boundaries. Writes lineage_specific_all.bed, DE.ids and
# lineage_specific_all.random; prints the counts to stdout.
#
# NOTE(review): every distance filter below reads column 10 as the distance
# that `bedtools closest -d` appends; that holds if the gene BED has 6 columns
# and the merged boundary BED has 3 -- confirm. A distance of -1 (no feature
# on that chromosome) is excluded by the `>= 0` test.

# Combine both sets of lineage specific boundaries in a single file (ie those from dmel and those from dtri)
# Both sets should be in dmel coordinates
cat dmel_to_dtri_dmel_nonconserved_coords_with_bound_id dtri_to_dmel_dmel_non_boundary_coord_with_bound_id \
  | sort -k1,1 -k2,2n \
  | bedtools merge -i - > lineage_specific_all.bed

# Get IDs of DE genes (P<=0.05) and count them
# Column 7 is tested (presumably padj); `grep FBgn` drops the header row and
# the double quotes around the IDs are stripped.
awk -F, '$7 != "NA" && $7 <= 0.05 { print $1 }' ../expression/deseq_output_all.NEW.csv \
  | grep FBgn \
  | tr -d '"' > DE.ids
wc -l DE.ids

# Get coordinates of DE genes and count how many of them are within 10kb from a lineage-specific boundary
# -F: the IDs are matched as fixed strings rather than regular expressions.
grep -F -f DE.ids dmel-all-r6.21.genes.MullerIDs.bed \
  | sort -k1,1 -k2,2n \
  | bedtools closest -d -a - -b lineage_specific_all.bed \
  | awk '$10 >= 0 && $10 < 10000' \
  | cut -f 4 | sort | uniq | wc -l

# Randomly sample 964 genes and ask how many are within 10 kb of lineage-specific boundaries 1000X
# Use this to get expected number
# NOTE(review): 964 is hard-coded; presumably it is the DE gene count
# reported above -- update it if DE.ids changes.
# The loop output is redirected once, so the file is recreated on every run
# without a separate `rm` (which errors when the file does not exist yet).
for i in {1..1000}; do
  grep Muller dmel-all-r6.21.genes.MullerIDs.bed \
    | shuf -n 964 \
    | sort -k1,1 -k2,2n \
    | bedtools closest -d -a - -b lineage_specific_all.bed \
    | awk '$10 >= 0 && $10 < 10000' \
    | cut -f 4 | sort | uniq | wc -l
done > lineage_specific_all.random

# Find total number of genes within 10kb of LS boundaries
sort -k1,1 -k2,2n dmel-all-r6.21.genes.MullerIDs.bed \
  | bedtools closest -d -a - -b lineage_specific_all.bed \
  | awk '$10 >= 0 && $10 < 10000' \
  | cut -f 4 | sort | uniq | wc -l

#### Calculate the observed and expected fraction of DE genes within 10kb of lineage-specific breakpoints
(Figure 4)

In [None]:
# NOTE(review): although this cell sits under the breakpoints heading, the
# commands below measure distance to lineage-specific *boundaries*
# (lineage_specific_all.bed) and regenerate lineage_specific_all.random.
# They look like a repeat of the boundary analysis; confirm whether they are
# needed here before the breakpoint script that follows.
#
# NOTE(review): every distance filter below reads column 10 as the distance
# that `bedtools closest -d` appends; that holds if the gene BED has 6 columns
# and the merged boundary BED has 3 -- confirm. A distance of -1 (no feature
# on that chromosome) is excluded by the `>= 0` test.

# Combine both sets of lineage specific boundaries in a single file (ie those from dmel and those from dtri)
# Both sets should be in dmel coordinates
cat dmel_to_dtri_dmel_nonconserved_coords_with_bound_id dtri_to_dmel_dmel_non_boundary_coord_with_bound_id \
  | sort -k1,1 -k2,2n \
  | bedtools merge -i - > lineage_specific_all.bed

# Get IDs of DE genes (P<=0.05) and count them
# Column 7 is tested (presumably padj); `grep FBgn` drops the header row and
# the double quotes around the IDs are stripped.
awk -F, '$7 != "NA" && $7 <= 0.05 { print $1 }' ../expression/deseq_output_all.NEW.csv \
  | grep FBgn \
  | tr -d '"' > DE.ids
wc -l DE.ids

# Get coordinates of DE genes and count how many of them are within 10kb from a lineage-specific boundary
# -F: the IDs are matched as fixed strings rather than regular expressions.
grep -F -f DE.ids dmel-all-r6.21.genes.MullerIDs.bed \
  | sort -k1,1 -k2,2n \
  | bedtools closest -d -a - -b lineage_specific_all.bed \
  | awk '$10 >= 0 && $10 < 10000' \
  | cut -f 4 | sort | uniq | wc -l

# Randomly sample 964 genes and ask how many are within 10 kb of lineage-specific boundaries 1000X
# Use this to get expected number
# The loop output is redirected once, so the file is recreated on every run
# without a separate `rm` (which errors when the file does not exist yet).
for i in {1..1000}; do
  grep Muller dmel-all-r6.21.genes.MullerIDs.bed \
    | shuf -n 964 \
    | sort -k1,1 -k2,2n \
    | bedtools closest -d -a - -b lineage_specific_all.bed \
    | awk '$10 >= 0 && $10 < 10000' \
    | cut -f 4 | sort | uniq | wc -l
done > lineage_specific_all.random

# Find total number of genes within 10kb of LS boundaries
sort -k1,1 -k2,2n dmel-all-r6.21.genes.MullerIDs.bed \
  | bedtools closest -d -a - -b lineage_specific_all.bed \
  | awk '$10 >= 0 && $10 < 10000' \
  | cut -f 4 | sort | uniq | wc -l
# Script: fraction_DE.breakpoints.sh
# Observed vs. expected number of DE genes within 10 kb of breakpoints that do
# not coincide with boundaries. Writes dmel_breakpoints_no_boundaries, DE.ids
# and dmel_breakpoints_no_boundaries.random; prints the counts to stdout.
#
# NOTE(review): every distance filter below reads column 10 as the distance
# that `bedtools closest -d` appends; that holds only if the gene BED has 6
# columns and the breakpoint file has 3 -- confirm. A distance of -1 (no
# feature on that chromosome) is excluded by the `>= 0` test.

# Exclude boundaries from breakpoint file
# The result is position-sorted because `bedtools closest` expects sorted input.
bedtools intersect -v -a dmel_breakpoints -b dmel_merge_mid5000 dmel_lc \
  | sort -k1,1 -k2,2n > dmel_breakpoints_no_boundaries

# Get IDs of DE genes (P<=0.05) and count them
# Column 7 is tested (presumably padj); `grep FBgn` drops the header row and
# the double quotes around the IDs are stripped.
awk -F, '$7 != "NA" && $7 <= 0.05 { print $1 }' ../expression/deseq_output_all.NEW.csv \
  | grep FBgn \
  | tr -d '"' > DE.ids
wc -l DE.ids

# Get coordinates of DE genes and count how many of them are within 10kb from a breakpoint
# -F: the IDs are matched as fixed strings rather than regular expressions.
grep -F -f DE.ids dmel-all-r6.21.genes.MullerIDs.bed \
  | sort -k1,1 -k2,2n \
  | bedtools closest -d -a - -b dmel_breakpoints_no_boundaries \
  | awk '$10 >= 0 && $10 < 10000' \
  | cut -f 4 | sort | uniq | wc -l

# Randomly sample 964 genes and ask how many are within 10 kb of breakpoints 1000X
# Use this to get expected number
# The loop output is redirected once, so the file is recreated on every run
# without a separate `rm` (which errors when the file does not exist yet).
for i in {1..1000}; do
  grep Muller dmel-all-r6.21.genes.MullerIDs.bed \
    | shuf -n 964 \
    | sort -k1,1 -k2,2n \
    | bedtools closest -d -a - -b dmel_breakpoints_no_boundaries \
    | awk '$10 >= 0 && $10 < 10000' \
    | cut -f 4 | sort | uniq | wc -l
done > dmel_breakpoints_no_boundaries.random

# Find total number of genes within 10kb of breakpoints
sort -k1,1 -k2,2n dmel-all-r6.21.genes.MullerIDs.bed \
  | bedtools closest -d -a - -b dmel_breakpoints_no_boundaries \
  | awk '$10 >= 0 && $10 < 10000' \
  | cut -f 4 | sort | uniq | wc -l
