In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset stored as JSON into a DataFrame.
dataset_path = 'exported_datasets/complete_set.json'
ngs_reads = pd.read_json(dataset_path)

In [3]:
# Pick the columns whose entry at index label 0 is a plain dict, then turn
# every entry of those columns into a DataFrame.
dict_columns = [
    column for column in ngs_reads.columns
    if type(ngs_reads[column][0]) is dict
]
for column in dict_columns:
    ngs_reads[column] = ngs_reads[column].apply(pd.DataFrame)

### Prepare data for use in machine learning algorithms
- Extract features from the nested DataFrames.
- Convert the features to numeric types.

In [4]:
# Discard the existing index labels and renumber the rows from 0.
ngs_reads = ngs_reads.reset_index(drop=True)

In [5]:
# extract features from Basic Statistics
#
# Entries of each nested 'Value' column that are read below:
#   3 -> total sequences
#   5 -> sequence length, split on '-' into a minimum (first part) and maximum (last part)
#   6 -> percent GC
# Entry 4 (sequences flagged as poor quality) is 0 for all data points and is not extracted.

row_count = ngs_reads.shape[0]
basic_stat_values = [ngs_reads['Basic Statistics'][row]['Value'] for row in range(row_count)]
length_bounds = [str(values[5]).split('-') for values in basic_stat_values]

total_sequences = np.asarray([values[3] for values in basic_stat_values], dtype=np.int64)
percent_gc = np.asarray([values[6] for values in basic_stat_values], dtype=np.int64)
min_sequence_length = np.asarray([bounds[0] for bounds in length_bounds], dtype=np.int64)
max_sequence_length = np.asarray([bounds[-1] for bounds in length_bounds], dtype=np.int64)

# Attach the new numeric columns in a fixed order.
extracted_features = {
    'total_sequences': total_sequences,
    'percent_gc': percent_gc,
    'min_sequence_length': min_sequence_length,
    'max_sequence_length': max_sequence_length,
}
for feature_name, feature_values in extracted_features.items():
    ngs_reads[feature_name] = feature_values

# The nested source column is dropped once its features have been extracted.
ngs_reads.drop(columns='Basic Statistics', inplace=True)

In [6]:
# convert evaluation value to numeric type
# The result is assigned back to the column rather than calling
# replace(inplace=True) on the Series returned by ngs_reads['evaluation'].
# That chained in-place call raises a pandas warning and, with Copy-on-Write
# enabled, modifies only a temporary object and leaves ngs_reads unchanged.
ngs_reads['evaluation'] = ngs_reads['evaluation'].replace({'ugly': 0, 'good': 1})

In [7]:
#convert statuses to numeric type
status_replacements = {'fail':0, 'warn':1, 'pass':2}


def _encode_statuses(statuses):
    """Return the statuses as a numpy array with known status strings mapped to codes.

    Values that have no entry in ``status_replacements`` are kept unchanged.
    """
    return np.asarray([status_replacements.get(status, status) for status in statuses])


# Assign the whole column at once. Writing each cell through
# ngs_reads['Module Statuses'].iat[i] is chained assignment, which does not
# update ngs_reads when pandas Copy-on-Write is enabled.
ngs_reads['Module Statuses'] = ngs_reads['Module Statuses'].apply(_encode_statuses)

In [8]:
# Display the frame to inspect the extracted and converted columns.
ngs_reads

Unnamed: 0,Per base sequence quality,Per tile sequence quality,Per sequence quality scores,Per base sequence content,Per sequence GC content,Per base N content,Sequence Length Distribution,Sequence Duplication Levels,Overrepresented sequences,Adapter Content,Module Statuses,organism,technology,read_number,evaluation,total_sequences,percent_gc,min_sequence_length,max_sequence_length
0,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 3.0 1 ...,Base G A T ...,GC Content Count 0 0 3.00 ...,Base N-Count 0 1 0.479607 1 ...,Length Count 0 30-39 4192.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[2, 0, 2, 2, 0, 2, 2, 1, 2, 2, 2]",Sau,MS,2,0,511669,34,35,301
1,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 13 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.009071 1 ...,Length Count 0 30-39 189.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[2, 2, 1, 2, 0, 2, 2, 1, 2, 2, 2]",Sau,MS,1,0,220494,34,35,301
2,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1668.0 ...,Base G A T ...,GC Content Count 0 0 1668....,Base N-Count 0 1 0.029292 1 ...,Length Count 0 30-39 4566.0 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[2, 0, 1, 2, 0, 1, 2, 1, 1, 2, 2]",Sau,MS,1,0,6015314,65,35,301
3,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 12 2.0 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 30-39 8954.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[2, 2, 2, 2, 0, 2, 2, 1, 1, 2, 2]",Sau,MS,1,0,1916272,35,35,301
4,Base Mean Median Lower Quartile...,Tile Base Mean 0 1102 ...,Quality Count 0 15 3.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.0 1 ...,Length Count 0 65-69 1.0 1 70...,Duplication Level Percentage of deduplicat...,...,Position Illumina Universal Adapter Illum...,"[2, 0, 0, 2, 0, 0, 2, 1, 2, 1, 2]",Sau,MS,2,0,334,35,69,301
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 15 1.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.080016 1 ...,Length Count 0 35-39 4455.0 1 ...,Duplication Level Percentage of deduplicat...,Se...,Position Illumina Universal Adapter Illum...,"[2, 2, 2, 2, 0, 2, 2, 1, 0, 1, 2]",Efcm,HS,1,1,1367226,38,35,251
180,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1.0 1 ...,Base G A T ...,GC Content Count 0 0 3...,Base N-Count 0 1 0.000708 1 ...,Length Count 0 35-39 5799.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[2, 2, 2, 2, 0, 2, 2, 1, 0, 2, 2]",Efcm,HS,2,1,1836688,39,35,251
181,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 16 2.0 1 ...,Base G A T ...,GC Content Count 0 0 0.0 ...,Base N-Count 0 1 0.080888 1 ...,Length Count 0 35-39 1949.0 1 ...,Duplication Level Percentage of deduplicat...,Se...,Position Illumina Universal Adapter Illum...,"[2, 2, 2, 2, 0, 2, 2, 1, 0, 1, 2]",Efcm,HS,1,1,799870,37,35,251
182,Base Mean Median Lower Quartile...,Tile Base Mean 0 1101 ...,Quality Count 0 2 1.0 1 ...,Base G A T ...,GC Content Count 0 0 0...,Base N-Count 0 1 0.000365 1 ...,Length Count 0 35-39 4278.0 1 ...,Duplication Level Percentage of deduplicat...,Empty DataFrame Columns: [] Index: [],Position Illumina Universal Adapter Illum...,"[2, 2, 2, 2, 0, 2, 2, 1, 0, 2, 2]",Efcm,HS,2,1,1369977,39,35,251


In [None]:
#ngs_reads['Adapter Content'][112]

In [None]:
#pd.set_option('display.max_rows', 500)
#for i in range(ngs_reads.shape[0]):
#    print(i, ngs_reads['Per base sequence quality'][i].shape[0], ngs_reads['organism'][i], ngs_reads['technology'][i])

In [None]:
#pd.set_option('display.max_rows', 500)
#for i in range(ngs_reads.shape[0]):
#    print(i, ngs_reads['Per base N content'][i].shape[0])

In [None]:
#ngs_reads.drop(columns=['organism', 'technology', 'read_number', 'evaluation'], inplace=True)
#ngs_reads

In [10]:
# Split the frame into one subset per organism label.
grouped = ngs_reads.groupby(ngs_reads['organism'])
efcm, sau, ecoli = (
    grouped.get_group(organism_label)
    for organism_label in ('Efcm', 'Sau', 'Ecoli')
)

In [11]:
# Print the shape of each per-organism subset and write it to its own JSON file.
print('exported datasets')
exports = (
    ('efcm -', efcm, 'exported_datasets/efcm.json'),
    ('sau -', sau, 'exported_datasets/sau.json'),
    ('ecoli -', ecoli, 'exported_datasets/ecoli.json'),
)
for label, subset, destination in exports:
    print(label, subset.shape)
    subset.to_json(destination)

exported datasets
efcm - (88, 19)
sau - (78, 19)
ecoli - (18, 19)
