# Data Extraction from FastQC Results

#### Import fastqQCParser   
https://pypi.org/project/fastqcparser/#description

In [22]:
from pprint import pprint
from fastqcparser import FastQCParser

In [23]:
import pandas as pd
import os

#### import all fastqc results from directories

In [24]:
rootdir = 'training_data'

In [25]:
def import_reads(rootdir):
    reads = []
    for root, dirs, files in os.walk(rootdir):
        for name in files:
            filepath = root + os.sep + name
            if filepath.endswith("fastqc_data.txt"):
                reads.append(FastQCParser(filepath))
    return reads

In [26]:
reads = import_reads(rootdir)

In [27]:
len(reads)

184

#### extract module names list (for column titles)

In [28]:
module_list = list(reads[0].modules.keys())
# imported reads don't include the module 'Overrepresented sequences' 
# if there are none in the read, so we manually add the module to the list
if(len(module_list) < 11):
    module_list.insert(9, 'Overrepresented sequences')
module_list.append('Module Statuses')
module_list.append('filename')
print(module_list)

['Basic Statistics', 'Per base sequence quality', 'Per tile sequence quality', 'Per sequence quality scores', 'Per base sequence content', 'Per sequence GC content', 'Per base N content', 'Sequence Length Distribution', 'Sequence Duplication Levels', 'Overrepresented sequences', 'Adapter Content', 'Module Statuses', 'filename']


#### fill columns for one read

In [29]:
def create_single_read_dataframe(result):
    module_result = []
    module_status = []
    
    for module in result.modules:
        result_data = pd.DataFrame(result.modules[module]['data'])
        result_data.columns = result.modules[module]['fieldnames']
        module_status.append(result.modules[module]['status'])
        module_result.append(result_data)
    # imported reads don't include the module 'Overrepresented sequences' 
    # if there are none in the read, so we manually add the status list
    # and an empty dataframe /is None better?
    if len(module_status) < 11:
        module_status.insert(9, 'pass')
        module_result.insert(9, pd.DataFrame())
    module_result.append(module_status)
    module_result.append(module_result[0].Value[0].replace(".fastq.gz", ""))
    module_series = pd.Series(data=module_result, index=module_list)
    single_read = module_series.to_frame().T#.set_index('filename')
    return single_read

#### apply to all reads and create single dataframe with all results

In [30]:
read_list = []
for read in reads:
    single_read = create_single_read_dataframe(read)
    read_list.append(single_read)

In [31]:
read_results = pd.concat(read_list)

In [36]:
type(read_results['Basic Statistics'])

pandas.core.series.Series

#### import metadata

In [11]:
metadata = pd.read_json('exported_datasets/metadata.json')

#### merge both datasets

In [12]:
complete_reads = pd.merge(read_results, metadata, on='filename', how='inner').set_index('filename')
complete_reads.head()

Unnamed: 0_level_0,Basic Statistics,Per base sequence quality,Per tile sequence quality,Per sequence quality scores,Per base sequence content,Per sequence GC content,Per base N content,Sequence Length Distribution,Sequence Duplication Levels,Overrepresented sequences,Adapter Content,Module Statuses,organism,technology,read_number,evaluation
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
200709_20-07968_20-00891_S21_L000_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 3.0 1 ...,Base G A T ...,GC Content Count 0 0 3.00 ...,Base N-Count 0 1 0.479607 1 ...,Length Count 0 30-39 4192.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, fail, pass, pass, fail, pass, pass, war...",Sau,MS,2,ugly
200204_20-00746_19-03927_S9_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 13 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.009071 1 ...,Length Count 0 30-39 189.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, warn, pass, fail, pass, pass, war...",Sau,MS,1,ugly
181002-18-6991-775-18_S1_L001_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1668.0 ...,Base G A T ...,GC Content Count 0 0 1668....,Base N-Count 0 1 0.029292 1 ...,Length Count 0 30-39 4566.0 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, fail, warn, pass, fail, warn, pass, war...",Sau,MS,1,ugly
200327_20-04028_20-00328_S25_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 12 2.0 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 30-39 8954.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Sau,MS,1,ugly
180727-18-5425-18-01680_S8_L001_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1102 ...,Quality Count 0 15 3.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 65-69 1.0 1 70...,Duplication Level Percentage of deduplicat...,...,Position Illumina Universal Adapter Illum...,"[pass, fail, fail, pass, fail, fail, pass, war...",Sau,MS,2,ugly


#### import positional data from raw fastq dataset

In [13]:
position_data = pd.read_json('exported_datasets/from_fastq_raw.json')

In [14]:
position_data.set_index('filename', inplace=True)

#### add positional dataset to complete dataset

In [15]:
complete_reads = complete_reads.join(position_data)

In [16]:
complete_reads.head()

Unnamed: 0_level_0,Basic Statistics,Per base sequence quality,Per tile sequence quality,Per sequence quality scores,Per base sequence content,Per sequence GC content,Per base N content,Sequence Length Distribution,Sequence Duplication Levels,Overrepresented sequences,...,technology,read_number,evaluation,G,C,A,T,N,phred_means,n_content
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
200709_20-07968_20-00891_S21_L000_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 3.0 1 ...,Base G A T ...,GC Content Count 0 0 3.00 ...,Base N-Count 0 1 0.479607 1 ...,Length Count 0 30-39 4192.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],...,MS,2,ugly,"[0.3341613426, 0.11678448370000001, 0.13434075...","[0.2022772535, 0.1802278426, 0.174083245200000...","[0.29404751900000003, 0.2354940401, 0.32372099...","[0.1647178156, 0.46721415600000005, 0.36769669...","[0.0047960693000000006, 0.0002794776, 0.000158...","[33.5481121584, 33.7296865747, 33.8020321731, ...",0.000191
200204_20-00746_19-03927_S9_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 13 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.009071 1 ...,Length Count 0 30-39 189.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],...,MS,1,ugly,"[0.3211153138, 0.11626166700000001, 0.13734614...","[0.1975563961, 0.17868966960000002, 0.16980507...","[0.31658457830000003, 0.2612497392, 0.34341977...","[0.1646530064, 0.4437989242, 0.3494290094, 0.4...","[9.070540000000001e-05, 0.0, 0.0, 0.0, 0.0, 0....","[32.3463314194, 32.1643037906, 33.0254428692, ...",0.000584
181002-18-6991-775-18_S1_L001_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1668.0 ...,Base G A T ...,GC Content Count 0 0 1668....,Base N-Count 0 1 0.029292 1 ...,Length Count 0 30-39 4566.0 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],...,MS,1,ugly,"[0.4630365098, 0.2234009729, 0.2668379406, 0.2...","[0.2583053852, 0.3056849568, 0.325215608000000...","[0.1668215491, 0.15523545400000002, 0.18280691...","[0.1115436368, 0.31540132400000004, 0.22486224...","[0.000292919, 0.0002772923, 0.0002772923, 0.00...","[33.7891855687, 33.7783872962, 33.7909218039, ...",0.000187
200327_20-04028_20-00328_S25_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 12 2.0 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 30-39 8954.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],...,MS,1,ugly,"[0.3209387811, 0.1170094851, 0.1332436105, 0.0...","[0.2183526138, 0.1876742968, 0.185322856, 0.19...","[0.29339415280000003, 0.236934527, 0.324150746...","[0.16731445220000002, 0.45838169110000004, 0.3...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[33.4176207762, 33.5015081366, 33.5445255162, ...",1.7e-05
180727-18-5425-18-01680_S8_L001_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1102 ...,Quality Count 0 15 3.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 65-69 1.0 1 70...,Duplication Level Percentage of deduplicat...,...,...,MS,2,ugly,"[0.37425149700000004, 0.0928143713, 0.11976047...","[0.2005988024, 0.1706586826, 0.1706586826, 0.1...","[0.2904191617, 0.2305389222, 0.3592814371, 0.1...","[0.1347305389, 0.505988024, 0.3502994012, 0.51...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[32.7934131737, 33.3113772455, 33.2784431138, ...",0.0


#### export complete dataset

In [17]:
complete_reads.to_json('exported_datasets/complete_set.json')
print('exported complete dataset')

exported complete dataset
