# Data Extraction from FastQC Results

#### Import fastqQCParser   
https://pypi.org/project/fastqcparser/#description

In [1]:
from pprint import pprint
from fastqcparser import FastQCParser

In [2]:
import pandas as pd
import os

#### import all fastqc results from directories

In [3]:
rootdir = 'training_data'

In [4]:
def import_reads(rootdir):
    reads = []
    for root, dirs, files in os.walk(rootdir):
        for name in files:
            filepath = root + os.sep + name
            if filepath.endswith("fastqc_data.txt"):
                reads.append(FastQCParser(filepath))
    return reads

In [5]:
reads = import_reads(rootdir)

In [6]:
len(reads)

184

#### extract module names list (for column titles)

In [7]:
module_list = list(reads[0].modules.keys())
# imported reads don't include the module 'Overrepresented sequences' 
# if there are none in the read, so we manually add the module to the list
if(len(module_list) < 11):
    module_list.insert(9, 'Overrepresented sequences')
module_list.append('Module Statuses')
module_list.append('filename')
print(module_list)

['Basic Statistics', 'Per base sequence quality', 'Per tile sequence quality', 'Per sequence quality scores', 'Per base sequence content', 'Per sequence GC content', 'Per base N content', 'Sequence Length Distribution', 'Sequence Duplication Levels', 'Overrepresented sequences', 'Adapter Content', 'Module Statuses', 'filename']


#### fill columns for one read

In [8]:
def create_single_read_dataframe(result):
    module_result = []
    module_status = []
    
    for module in result.modules:
        result_data = pd.DataFrame(result.modules[module]['data'])
        result_data.columns = result.modules[module]['fieldnames']
        module_status.append(result.modules[module]['status'])
        module_result.append(result_data)
    # imported reads don't include the module 'Overrepresented sequences' 
    # if there are none in the read, so we manually add the status list
    # and an empty dataframe /is None better?
    if len(module_status) < 11:
        module_status.insert(9, 'pass')
        module_result.insert(9, pd.DataFrame())
    module_result.append(module_status)
    module_result.append(module_result[0].Value[0].replace(".fastq.gz", ""))
    module_series = pd.Series(data=module_result, index=module_list)
    single_read = module_series.to_frame().T#.set_index('filename')
    return single_read

#### apply to all reads and create single dataframe with all results

In [9]:
read_list = []
for read in reads:
    single_read = create_single_read_dataframe(read)
    read_list.append(single_read)

In [10]:
read_results = pd.concat(read_list)

#### import metadata

In [11]:
metadata = pd.read_json('exported_datasets/metadata.json')

#### merge both datasets

In [12]:
complete_reads = pd.merge(read_results, metadata, on='filename', how='inner').set_index('filename')
complete_reads.head()

Unnamed: 0_level_0,Basic Statistics,Per base sequence quality,Per tile sequence quality,Per sequence quality scores,Per base sequence content,Per sequence GC content,Per base N content,Sequence Length Distribution,Sequence Duplication Levels,Overrepresented sequences,Adapter Content,Module Statuses,organism,technology,read_number,evaluation
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
200709_20-07968_20-00891_S21_L000_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 3.0 1 ...,Base G A T ...,GC Content Count 0 0 3.00 ...,Base N-Count 0 1 0.479607 1 ...,Length Count 0 30-39 4192.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, fail, pass, pass, fail, pass, pass, war...",Sau,MS,2,ugly
200204_20-00746_19-03927_S9_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 13 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.009071 1 ...,Length Count 0 30-39 189.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, warn, pass, fail, pass, pass, war...",Sau,MS,1,ugly
181002-18-6991-775-18_S1_L001_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1668.0 ...,Base G A T ...,GC Content Count 0 0 1668....,Base N-Count 0 1 0.029292 1 ...,Length Count 0 30-39 4566.0 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, fail, warn, pass, fail, warn, pass, war...",Sau,MS,1,ugly
200327_20-04028_20-00328_S25_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 12 2.0 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 30-39 8954.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Sau,MS,1,ugly
180727-18-5425-18-01680_S8_L001_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1102 ...,Quality Count 0 15 3.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 65-69 1.0 1 70...,Duplication Level Percentage of deduplicat...,...,Position Illumina Universal Adapter Illum...,"[pass, fail, fail, pass, fail, fail, pass, war...",Sau,MS,2,ugly


#### export complete dataset

In [13]:
complete_reads.to_json('exported_datasets/complete_set.json')
print('exported complete dataset')

exported complete dataset
