# Run Fastqc for HiC Samples
- **Author** - Frank Grenn
- **Date Started** - November 2019
- **Quick Description:** code to generate a swarm file to run all samples through fastqc

In [None]:
import os
from os import listdir
from pathlib import Path
import pandas as pd


# Set up the input file directory with this format:

```
|-ParentDirectory
|   |-sample1
|   |   |-sample1_R1_001.fastq.gz
|   |   |-sample1_R2_001.fastq.gz
|   |-sample2
|   |   |-sample2_R1_001.fastq.gz
|   |   |-sample2_R2_001.fastq.gz
|   |-sample3
|   |   |-sample3_R1_001.fastq.gz
|   |   |-sample3_R2_001.fastq.gz
        
```


## Output format:

```
|-fastqc
|   |-sample1
|   |   |-sample1_R1_001_fastqc.html
|   |   |-sample1_R1_001_fastqc.zip
|   |   |-sample1_R2_001_fastqc.html
|   |   |-sample1_R2_001_fastqc.zip
|   |-sample2
|   |   |-sample2_R1_001_fastqc.html
|   |   |-sample2_R1_001_fastqc.zip
|   |   |-sample2_R2_001_fastqc.html
|   |   |-sample2_R2_001_fastqc.zip
|   |-sample3
|   |   |-sample3_R1_001_fastqc.html
|   |   |-sample3_R1_001_fastqc.zip
|   |   |-sample3_R2_001_fastqc.html
|   |   |-sample3_R2_001_fastqc.zip
        
```

In [None]:
# Parent directory holding one sub-directory per sample (layout shown above).
SAMPLE_DIR = "/path/to/ParentDirectory"
# Directory that receives the fastqc output.
FASTQC_DIR = "/path/to/fastqc"
# Directory where the generated swarm file is written.
SCRIPT_DIR = "/path/to/directory/for/scripts"


In [None]:
# List every entry (files and directories) directly inside SAMPLE_DIR.
filelist = os.listdir(SAMPLE_DIR)

In [None]:
# Sanity check: show every entry found in SAMPLE_DIR (files and directories alike).
print(filelist)

In [None]:
# Keep only the entries of SAMPLE_DIR that are directories; plain files are
# dropped. Each remaining name is treated as one sample.
sample_directories = [
    entry
    for entry in os.listdir(SAMPLE_DIR)
    if os.path.isdir(os.path.join(SAMPLE_DIR, entry))
]
# Report how many sample directories were found, then the names themselves.
print(len(sample_directories), sample_directories, sep="\n")

In [None]:
# Create one output sub-directory per sample under FASTQC_DIR.
# Missing parents are created, and an already-existing directory is not an error.
for sample in sample_directories:
    sample_out_dir = Path(f"{FASTQC_DIR}/{sample}")
    sample_out_dir.mkdir(exist_ok=True, parents=True)

In [None]:
# Generate the swarm file: one fastqc command per sample, covering both the
# R1 and R2 reads, with results written to that sample's sub-directory of
# FASTQC_DIR.
# The `with` block guarantees the file is flushed and closed even if
# formatting or writing raises part-way through the loop.
with open(f"{SCRIPT_DIR}/run_fastqc.swarm", "w") as out_file:
    for sample in sample_directories:
        sample_in_dir = f"{SAMPLE_DIR}/{sample}"
        # The literal " \n" plus print()'s own newline leaves a blank line
        # after each command; kept so the generated file is unchanged.
        print(
            f"fastqc -o {FASTQC_DIR}/{sample} -f fastq "
            f"{sample_in_dir}/{sample}_R1_001.fastq.gz "
            f"{sample_in_dir}/{sample}_R2_001.fastq.gz \n",
            file=out_file,
        )

In [None]:
# Show the swarm command to run on biowulf for the generated file.
# This cell only prints the command; it does not submit the job itself.
swarm_command = "swarm -f {}/run_fastqc.swarm --module fastqc --time 2-00:00:00".format(SCRIPT_DIR)
print(swarm_command)