# Machine Learning Evaluation of NGS Fastqc Results

## Import fastqc results into a dataframe

# Import fastqQCParser   
https://pypi.org/project/fastqcparser/#description

In [1]:
from pprint import pprint
from fastqcparser import FastQCParser

In [2]:
import pandas as pd
import os

In [3]:
rootdir = 'training_data' #os.path.abspath(os.curdir)

In [4]:
def import_reads(rootdir):
    reads = []
    for root, dirs, files in os.walk(rootdir):
        for name in files:
            filepath = root + os.sep + name
            if filepath.endswith("fastqc_data.txt"):
                reads.append(FastQCParser(filepath))
    return reads

In [5]:
reads = import_reads(rootdir)

In [6]:
len(reads)

184

In [7]:
module_list = list(reads[0].modules.keys())
# imported reads don't include the module 'Overrepresented sequences' 
# if there are none in the read, so we manually add the module to the list
if(len(module_list) < 11):
    module_list.insert(9, 'Overrepresented sequences')
module_list.append('Module Statuses')
module_list.append('filename')
print(module_list)

['Basic Statistics', 'Per base sequence quality', 'Per tile sequence quality', 'Per sequence quality scores', 'Per base sequence content', 'Per sequence GC content', 'Per base N content', 'Sequence Length Distribution', 'Sequence Duplication Levels', 'Overrepresented sequences', 'Adapter Content', 'Module Statuses', 'filename']


In [8]:
def create_single_read_dataframe(result):
    module_result = []
    module_status = []
    
    for module in result.modules:
        result_data = pd.DataFrame(result.modules[module]['data'])
        result_data.columns = result.modules[module]['fieldnames']
        module_status.append(result.modules[module]['status'])
        module_result.append(result_data)
    # imported reads don't include the module 'Overrepresented sequences' 
    # if there are none in the read, so we manually add the status list
    # and an empty dataframe /is None better?
    if len(module_status) < 11:
        module_status.insert(9, 'pass')
        module_result.insert(9, pd.DataFrame())
    module_result.append(module_status)
    module_result.append(module_result[0].Value[0].replace(".fastq.gz", ""))
    #print(len(module_result), ' - ',  module_result[0].Value[0], '-', len(module_list) )
    module_series = pd.Series(data=module_result, index=module_list)
    single_read = module_series.to_frame().T#.set_index('filename')
    return single_read

In [9]:
read_list = []
for read in reads:
    single_read = create_single_read_dataframe(read)
    read_list.append(single_read)

In [10]:
read_results = pd.concat(read_list)
read_results

Unnamed: 0,Basic Statistics,Per base sequence quality,Per tile sequence quality,Per sequence quality scores,Per base sequence content,Per sequence GC content,Per base N content,Sequence Length Distribution,Sequence Duplication Levels,Overrepresented sequences,Adapter Content,Module Statuses,filename
0,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 3.0 1 ...,Base G A T ...,GC Content Count 0 0 3.00 ...,Base N-Count 0 1 0.479607 1 ...,Length Count 0 30-39 4192.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, fail, pass, pass, fail, pass, pass, war...",200709_20-07968_20-00891_S21_L000_R2_001
0,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 13 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.009071 1 ...,Length Count 0 30-39 189.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, warn, pass, fail, pass, pass, war...",200204_20-00746_19-03927_S9_L000_R1_001
0,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1668.0 ...,Base G A T ...,GC Content Count 0 0 1668....,Base N-Count 0 1 0.029292 1 ...,Length Count 0 30-39 4566.0 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, fail, warn, pass, fail, warn, pass, war...",181002-18-6991-775-18_S1_L001_R1_001
0,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 12 2.0 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 30-39 8954.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",200327_20-04028_20-00328_S25_L000_R1_001
0,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1102 ...,Quality Count 0 15 3.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 65-69 1.0 1 70...,Duplication Level Percentage of deduplicat...,...,Position Illumina Universal Adapter Illum...,"[pass, fail, fail, pass, fail, fail, pass, war...",180727-18-5425-18-01680_S8_L001_R2_001
...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 15 1.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.080016 1 ...,Length Count 0 35-39 4455.0 1 ...,Duplication Level Percentage of deduplicat...,Se...,Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",191113_19-10152_20476_S134_L000_R1_001
0,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1.0 1 ...,Base G A T ...,GC Content Count 0 0 3...,Base N-Count 0 1 0.000708 1 ...,Length Count 0 35-39 5799.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",191113_19-10150_20474_S132_L000_R2_001
0,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 16 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.080888 1 ...,Length Count 0 35-39 1949.0 1 ...,Duplication Level Percentage of deduplicat...,Se...,Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",191113_19-10153_20477_S135_L000_R1_001
0,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1.0 1 ...,Base G A T ...,GC Content Count 0 0 0...,Base N-Count 0 1 0.000365 1 ...,Length Count 0 35-39 4278.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",191113_19-10151_20475_S133_L000_R2_001


In [11]:
pd.set_option('display.max_rows', 10)

In [12]:
metadata = pd.read_json('exported_datasets/metadata.json')
metadata

Unnamed: 0,filename,organism,technology,read_number,evaluation
0,200709_20-07968_20-00891_S21_L000_R2_001,Sau,MS,2,ugly
1,200204_20-00746_19-03927_S9_L000_R1_001,Sau,MS,1,ugly
2,181002-18-6991-775-18_S1_L001_R1_001,Sau,MS,1,ugly
3,200327_20-04028_20-00328_S25_L000_R1_001,Sau,MS,1,ugly
4,180727-18-5425-18-01680_S8_L001_R2_001,Sau,MS,2,ugly
...,...,...,...,...,...
179,191113_19-10152_20476_S134_L000_R1_001,Efcm,HS,1,good
180,191113_19-10150_20474_S132_L000_R2_001,Efcm,HS,2,good
181,191113_19-10153_20477_S135_L000_R1_001,Efcm,HS,1,good
182,191113_19-10151_20475_S133_L000_R2_001,Efcm,HS,2,good


In [13]:
complete_reads = pd.merge(read_results, metadata, on='filename', how='inner').set_index('filename')
complete_reads

Unnamed: 0_level_0,Basic Statistics,Per base sequence quality,Per tile sequence quality,Per sequence quality scores,Per base sequence content,Per sequence GC content,Per base N content,Sequence Length Distribution,Sequence Duplication Levels,Overrepresented sequences,Adapter Content,Module Statuses,organism,technology,read_number,evaluation
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
200709_20-07968_20-00891_S21_L000_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 3.0 1 ...,Base G A T ...,GC Content Count 0 0 3.00 ...,Base N-Count 0 1 0.479607 1 ...,Length Count 0 30-39 4192.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, fail, pass, pass, fail, pass, pass, war...",Sau,MS,2,ugly
200204_20-00746_19-03927_S9_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 13 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.009071 1 ...,Length Count 0 30-39 189.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, warn, pass, fail, pass, pass, war...",Sau,MS,1,ugly
181002-18-6991-775-18_S1_L001_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1668.0 ...,Base G A T ...,GC Content Count 0 0 1668....,Base N-Count 0 1 0.029292 1 ...,Length Count 0 30-39 4566.0 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, fail, warn, pass, fail, warn, pass, war...",Sau,MS,1,ugly
200327_20-04028_20-00328_S25_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 12 2.0 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 30-39 8954.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Sau,MS,1,ugly
180727-18-5425-18-01680_S8_L001_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1102 ...,Quality Count 0 15 3.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 65-69 1.0 1 70...,Duplication Level Percentage of deduplicat...,...,Position Illumina Universal Adapter Illum...,"[pass, fail, fail, pass, fail, fail, pass, war...",Sau,MS,2,ugly
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191113_19-10152_20476_S134_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 15 1.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.080016 1 ...,Length Count 0 35-39 4455.0 1 ...,Duplication Level Percentage of deduplicat...,Se...,Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Efcm,HS,1,good
191113_19-10150_20474_S132_L000_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1.0 1 ...,Base G A T ...,GC Content Count 0 0 3...,Base N-Count 0 1 0.000708 1 ...,Length Count 0 35-39 5799.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Efcm,HS,2,good
191113_19-10153_20477_S135_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 16 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.080888 1 ...,Length Count 0 35-39 1949.0 1 ...,Duplication Level Percentage of deduplicat...,Se...,Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Efcm,HS,1,good
191113_19-10151_20475_S133_L000_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1.0 1 ...,Base G A T ...,GC Content Count 0 0 0...,Base N-Count 0 1 0.000365 1 ...,Length Count 0 35-39 4278.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Efcm,HS,2,good


In [14]:
complete_reads['evaluation'].replace({'ugly': 0, 'good': 1}, inplace=True)
complete_reads

Unnamed: 0_level_0,Basic Statistics,Per base sequence quality,Per tile sequence quality,Per sequence quality scores,Per base sequence content,Per sequence GC content,Per base N content,Sequence Length Distribution,Sequence Duplication Levels,Overrepresented sequences,Adapter Content,Module Statuses,organism,technology,read_number,evaluation
filename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
200709_20-07968_20-00891_S21_L000_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 3.0 1 ...,Base G A T ...,GC Content Count 0 0 3.00 ...,Base N-Count 0 1 0.479607 1 ...,Length Count 0 30-39 4192.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, fail, pass, pass, fail, pass, pass, war...",Sau,MS,2,0
200204_20-00746_19-03927_S9_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 13 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.009071 1 ...,Length Count 0 30-39 189.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, warn, pass, fail, pass, pass, war...",Sau,MS,1,0
181002-18-6991-775-18_S1_L001_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1668.0 ...,Base G A T ...,GC Content Count 0 0 1668....,Base N-Count 0 1 0.029292 1 ...,Length Count 0 30-39 4566.0 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, fail, warn, pass, fail, warn, pass, war...",Sau,MS,1,0
200327_20-04028_20-00328_S25_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 12 2.0 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 30-39 8954.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Sau,MS,1,0
180727-18-5425-18-01680_S8_L001_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1102 ...,Quality Count 0 15 3.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 65-69 1.0 1 70...,Duplication Level Percentage of deduplicat...,...,Position Illumina Universal Adapter Illum...,"[pass, fail, fail, pass, fail, fail, pass, war...",Sau,MS,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191113_19-10152_20476_S134_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 15 1.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.080016 1 ...,Length Count 0 35-39 4455.0 1 ...,Duplication Level Percentage of deduplicat...,Se...,Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Efcm,HS,1,1
191113_19-10150_20474_S132_L000_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1.0 1 ...,Base G A T ...,GC Content Count 0 0 3...,Base N-Count 0 1 0.000708 1 ...,Length Count 0 35-39 5799.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Efcm,HS,2,1
191113_19-10153_20477_S135_L000_R1_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 16 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.080888 1 ...,Length Count 0 35-39 1949.0 1 ...,Duplication Level Percentage of deduplicat...,Se...,Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Efcm,HS,1,1
191113_19-10151_20475_S133_L000_R2_001,Measure \ 0 ...,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1.0 1 ...,Base G A T ...,GC Content Count 0 0 0...,Base N-Count 0 1 0.000365 1 ...,Length Count 0 35-39 4278.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[pass, pass, pass, pass, fail, pass, pass, war...",Efcm,HS,2,1


In [15]:
grouped = complete_reads.groupby(complete_reads.organism)
efcm = grouped.get_group('Efcm')
sau = grouped.get_group('Sau')
ecoli = grouped.get_group('Ecoli')

In [16]:
print('exported datasets')
print('efcm -', efcm.shape)
efcm.to_json('exported_datasets/efcm.json')
print('sau -', sau.shape)
sau.to_json('exported_datasets/sau.json')
print('ecoli -', ecoli.shape)
ecoli.to_json('exported_datasets/ecoli.json')

exported datasets
efcm - (88, 16)
sau - (78, 16)
ecoli - (18, 16)
