# Fastq file preprocessing

In [3]:
data_dir = 'Data/Fastq_Data_preprocess'

from Bio import SeqIO
# Import Biopython.SeqIO for reading reading sequence files

import gzip
# Import gzip for reading compressed files, in this case, the files is not compressed

import os
# Import os for reading files and interact in directory

for filename in os.listdir(data_dir):
    if filename.endswith(".fastq"):  # Look for file that ends with .fastq in the base dir
        file_path = os.path.join(data_dir, filename)
        print(f"Processing: {file_path}")

        with open(file_path, "rt") as handle:
            r = 0 # Initialize counter for record number
            for record in SeqIO.parse(handle, "fastq"):
                r +=1
                # ID, sequence and quality can be print, but im just gonna look for record number
                #print(f"ID: {record.id}")
                #print(f"Sequence: {record.seq}")
                #print(f"Quality: {record.letter_annotations['phred_quality']}\n")

            print('Record Number =', r)


Processing: Data/Fastq_Data_preprocess/SRR5309277.fastq
Record Number = 26988845
Processing: Data/Fastq_Data_preprocess/SRR5309286.fastq
Record Number = 34658939


In [5]:
import subprocess
# Import subprocess for running external programs in terminal, I dont like using UNIX, so I use this alternative

output_dir = "Data/Fastq_Data_preprocess/Result"
# Create a dir for the output files


for filename in os.listdir(data_dir):
    if filename.endswith(".fastq"):  
        # Again look for fastq file
        file_path = os.path.join(data_dir, filename)
        print(f"Processing: {file_path}")
        subprocess.run(["fastqc", file_path, "-o", output_dir], check=True)
        # and run fastqc in the terminal, for the file path and output to  output dir

Processing: Data/Fastq_Data_preprocess/SRR5309277.fastq
null


Started analysis of SRR5309277.fastq
Approx 5% complete for SRR5309277.fastq
Approx 10% complete for SRR5309277.fastq
Approx 15% complete for SRR5309277.fastq
Approx 20% complete for SRR5309277.fastq
Approx 25% complete for SRR5309277.fastq
Approx 30% complete for SRR5309277.fastq
Approx 35% complete for SRR5309277.fastq
Approx 40% complete for SRR5309277.fastq
Approx 45% complete for SRR5309277.fastq
Approx 50% complete for SRR5309277.fastq
Approx 55% complete for SRR5309277.fastq
Approx 60% complete for SRR5309277.fastq
Approx 65% complete for SRR5309277.fastq
Approx 70% complete for SRR5309277.fastq
Approx 75% complete for SRR5309277.fastq
Approx 80% complete for SRR5309277.fastq
Approx 85% complete for SRR5309277.fastq
Approx 90% complete for SRR5309277.fastq
Approx 95% complete for SRR5309277.fastq


Analysis complete for SRR5309277.fastq
Processing: Data/Fastq_Data_preprocess/SRR5309286.fastq
null


Started analysis of SRR5309286.fastq
Approx 5% complete for SRR5309286.fastq
Approx 10% complete for SRR5309286.fastq
Approx 15% complete for SRR5309286.fastq
Approx 20% complete for SRR5309286.fastq
Approx 25% complete for SRR5309286.fastq
Approx 30% complete for SRR5309286.fastq
Approx 35% complete for SRR5309286.fastq
Approx 40% complete for SRR5309286.fastq
Approx 45% complete for SRR5309286.fastq
Approx 50% complete for SRR5309286.fastq
Approx 55% complete for SRR5309286.fastq
Approx 60% complete for SRR5309286.fastq
Approx 65% complete for SRR5309286.fastq
Approx 70% complete for SRR5309286.fastq
Approx 75% complete for SRR5309286.fastq
Approx 80% complete for SRR5309286.fastq
Approx 85% complete for SRR5309286.fastq
Approx 90% complete for SRR5309286.fastq
Approx 95% complete for SRR5309286.fastq


Analysis complete for SRR5309286.fastq


In [6]:
subprocess.run(["multiqc", output_dir], check=True) 
# Run multiqc in the terminal for the output dir, to generate a report of the fastqc results


[91m///[0m ]8;id=419611;https://multiqc.info\[1mMultiQC[0m]8;;\ 🔍 [2mv1.27.1[0m

[34m     version_check[0m | [33mMultiQC Version v1.28 now available![0m
[34m       file_search[0m | Search path: /home/chongcfu/Bioinformatics project/Data/Fastq_Data_preprocess/Result
[2K         [34msearching[0m | [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [35m100%[0m [32m4/4[0m  0/4[0m  
[?25h[34m            fastqc[0m | Found 2 reports
[34m     write_results[0m | Data        : multiqc_data
[34m     write_results[0m | Report      : multiqc_report.html
[34m           multiqc[0m | MultiQC complete


CompletedProcess(args=['multiqc', 'Data/Fastq_Data_preprocess/Result'], returncode=0)

In [None]:
import subprocess


for filename in os.listdir(data_dir):
    if filename.endswith(".fastq"):  
        file_path = os.path.join(data_dir, filename)

        print(f"Processing: {file_path}")
        output_file = file_path.lstrip(data_dir)
        output_file = 'Data/Fastq_Data_preprocess/Trimmed'+output_file

        subprocess.run([
            "trimmomatic", "SE", "-phred33",  # Single-End, Phred score 33
            file_path, output_file,
            "LEADING:3", "TRAILING:3",  # Trim low-quality bases at start and end
            "SLIDINGWINDOW:4:15",  # Remove if average quality < 15 in a 4-base window
            "MINLEN:36"  # Remove reads shorter than 36bp
        ], check=True)


Processing: Data/Fastq_Data_preprocess/SRR5309277.fastq


TrimmomaticSE: Started with arguments:
 -phred33 Data/Fastq_Data_preprocess/SRR5309277.fastq Data/Fastq_Data_preprocess/TrimmedSRR5309277.fastq LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
Automatically using 1 threads
Input Reads: 26988845 Surviving: 26968211 (99.92%) Dropped: 20634 (0.08%)
TrimmomaticSE: Completed successfully


Processing: Data/Fastq_Data_preprocess/SRR5309286.fastq


TrimmomaticSE: Started with arguments:
 -phred33 Data/Fastq_Data_preprocess/SRR5309286.fastq Data/Fastq_Data_preprocess/TrimmedSRR5309286.fastq LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36
Automatically using 1 threads
