In [8]:
import os
os.getcwd()
try: 
    os.mkdir("../analysis/")
except FileExistsError:
    print("Directory already exists")
os.chdir("../analysis")
os.getcwd()

Directory already exists


'/home/xie186/github/UMD_BIOI611_lab/analysis'

In [1]:
%%bash

/home/xie186/github/UMD_BIOI611_lab/notebook


## Download reference genome 

To download the reference for this lab, we use [ENSEMBL database](https://useast.ensembl.org/Caenorhabditis_elegans/Info/Index). 
In ENSEMBL database, each species may have different releases of genome build. We use `release-111` in this project. 

The genome sequences can be obtained from the link below:
https://ftp.ensembl.org/pub/release-111/fasta/caenorhabditis_elegans/dna/

The genoe anntation file in gtf format can be obtained here: 
https://ftp.ensembl.org/pub/release-111/gtf/caenorhabditis_elegans/


In [8]:
%%bash
mkdir -p reference/ 
    wget -O reference/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz -nv https://ftp.ensembl.org/pub/release-111/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz
gunzip reference/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz

2024-05-11 20:47:10 URL:https://ftp.ensembl.org/pub/release-111/fasta/caenorhabditis_elegans/dna/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz [30316631/30316631] -> "Caenorhabditis_elegans.WBcel235.dna.toplevel.fa.gz" [1]


In [None]:
%%bash
## A *fai file will be generated
samtools faidx reference/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa 

In [27]:
%%bash

wget -O reference/Caenorhabditis_elegans.WBcel235.111.gtf.gz -nv https://ftp.ensembl.org/pub/release-111/gtf/caenorhabditis_elegans/Caenorhabditis_elegans.WBcel235.111.gtf.gz
gunzip reference/Caenorhabditis_elegans.WBcel235.111.gtf.gz

2024-05-11 21:03:37 URL:https://ftp.ensembl.org/pub/release-111/gtf/caenorhabditis_elegans/Caenorhabditis_elegans.WBcel235.111.gtf.gz [8730028/8730028] -> "Caenorhabditis_elegans.WBcel235.111.gtf.gz" [1]


## Build reference genome 


In [36]:
%%bash
STAR --runThreadN 23 --runMode genomeGenerate --genomeDir STAR_ref \
         --genomeFastaFiles reference/Caenorhabditis_elegans.WBcel235.dna.toplevel.fa \
         --sjdbGTFfile reference/Caenorhabditis_elegans.WBcel235.111.gtf \
         --genomeSAindexNbases 12 

In [38]:
%%bash
ls -sh STAR_ref/

total 2.6G
4.0K chrLength.txt
4.0K chrNameLength.txt
4.0K chrName.txt
4.0K chrStart.txt
7.9M exonGeTrInfo.tab
3.2M exonInfo.tab
1.6M geneInfo.tab
119M Genome
4.0K genomeParameters.txt
8.0K Log.out
973M SA
1.5G SAindex
2.9M sjdbInfo.txt
3.0M sjdbList.fromGTF.out.tab
2.4M sjdbList.out.tab
3.1M transcriptInfo.tab


## Download the reads 


| SRR_ID       | Sample_name   |
|--------------|---------------|
| SRR15694101  | N2_day7_rep2  |
| SRR15694102  | N2_day7_rep1  |
| SRR15694100  |  N2_day7_rep3 |
| SRR15694099  | N2_day1_rep1  |
| SRR15694098  | N2_day1_rep2  |
| SRR15694097  | N2_day1_rep3  |

In [None]:
%%bash
mkdir -p raw_data/ 
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR156/002/SRR15694102/SRR15694102.fastq.gz -o raw_data/N2_day7_rep1.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR156/001/SRR15694101/SRR15694101.fastq.gz -o raw_data/N2_day7_rep2.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR156/000/SRR15694100/SRR15694100.fastq.gz -o raw_data/N2_day7_rep3.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR156/099/SRR15694099/SRR15694099.fastq.gz -o raw_data/N2_day1_rep1.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR156/098/SRR15694098/SRR15694098.fastq.gz -o raw_data/N2_day1_rep2.fastq.gz
curl -L ftp://ftp.sra.ebi.ac.uk/vol1/fastq/SRR156/097/SRR15694097/SRR15694097.fastq.gz -o raw_data/N2_day1_rep3.fastq.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
 16 4123M   16  684M    0     0   655k      0  1:47:22  0:17:49  1:29:33  519k
curl: (18) transfer closed with 3605679051 bytes remaining to read
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed


## Check the read quality 


In [None]:
%%bash
ls

## Clean up data 

In [None]:
%%bash
# Clean up the folders
cleanup=True
if [ cleanup ]; then
    rm -rf Caenorhabditis_elegans.WBcel235.dna.toplevel.fa* Caenorhabditis_elegans.WBcel235.111.gtf*\
    rm -rf STAR_ref 
    rm -rf raw_data/ 
fi

In [None]:
%%bash
