## Prepare data for use in machine learning algorithms
* extract features from nested dataframes  
* convert features to numeric types

In [1]:
import pandas as pd
import numpy as np

#### import dataset

In [2]:
# Load the complete exported dataset (path is relative to the notebook).
ngs_reads = pd.read_json(
    'exported_datasets/complete_set.json'
)

In [3]:
# Convert dict-typed columns to DataFrames.
# The first row is read by position (.iloc[0]) rather than by label ([0]):
# the index is only reset in a later cell, so label 0 is not guaranteed to
# exist at this point. An empty frame has no first row and is left untouched.
for category in ngs_reads:
    if len(ngs_reads) > 0 and isinstance(ngs_reads[category].iloc[0], dict):
        ngs_reads[category] = ngs_reads[category].apply(pd.DataFrame)

In [4]:
# Replace the loaded index with a fresh 0..n-1 index; the old one is discarded.
ngs_reads = ngs_reads.reset_index(drop=True)

### Basic Statistics
#### extract features from Basic Statistics

In [5]:
# Show the 'Basic Statistics' table of the first read as an example.
ngs_reads['Basic Statistics'].iloc[0]

Unnamed: 0,Measure,Value
0,Filename,200709_20-07968_20-00891_S21_L000_R2_001.fastq.gz
1,File type,Conventional base calls
2,Encoding,Sanger / Illumina 1.9
3,Total Sequences,511669
4,Sequences flagged as poor quality,0
5,Sequence length,35-301
6,%GC,34


In [6]:
# extract features from Basic Statistics

def _basic_statistics_features(basic_statistics):
    """Return the numeric features of one 'Basic Statistics' table as a dict.

    Values are looked up by their 'Measure' name instead of by hard-coded row
    position, so the result does not depend on the row order or on whether
    the table's index holds integer or string labels.

    Parameters:
        basic_statistics: DataFrame with a 'Measure' and a 'Value' column.

    Returns:
        dict with the keys 'total_sequences', 'percent_gc',
        'min_sequence_length' and 'max_sequence_length' (values unconverted).

    Raises:
        KeyError: if one of the required measures is missing from the table.
    """
    values = basic_statistics.set_index('Measure')['Value']
    # 'Sequence length' is either a range such as '35-301' or a single number;
    # for a single number the minimum and the maximum are the same value.
    length_min_max = str(values['Sequence length']).split('-')
    return {
        'total_sequences': values['Total Sequences'],
        'percent_gc': values['%GC'],
        'min_sequence_length': length_min_max[0],
        'max_sequence_length': length_min_max[-1],
    }

# 'Sequences flagged as poor quality' is not extracted: it is 0 for all data points.
basic_statistics_rows = [
    _basic_statistics_features(table) for table in ngs_reads['Basic Statistics']
]

# Add one int64 column per feature, in this order.
for feature in ('total_sequences', 'percent_gc', 'min_sequence_length', 'max_sequence_length'):
    ngs_reads[feature] = np.asarray(
        [row[feature] for row in basic_statistics_rows], dtype=np.int64
    )

# The nested tables are no longer needed once the features are columns.
ngs_reads = ngs_reads.drop(columns='Basic Statistics')

### Evaluation (target variable)
#### convert evaluation value to numeric type

In [7]:
# Encode the target variable: 'ugly' -> 0, 'good' -> 1 (other values are kept).
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: that form modifies an
# intermediate object and does not reliably update ngs_reads itself
# (chained assignment, see the pandas Copy-on-Write documentation).
ngs_reads['evaluation'] = ngs_reads['evaluation'].replace({'ugly': 0, 'good': 1})

### Module Statuses
#### convert statuses to numeric type

In [8]:
status_replacements = {'fail':0, 'warn':1, 'pass':2}

# Encode every status list in one pass; statuses without a replacement are
# kept unchanged. The whole column is assigned at once instead of writing
# element-wise through ngs_reads['Module Statuses'].iat[i], which is a chained
# assignment and does not reliably update ngs_reads itself.
ngs_reads['Module Statuses'] = ngs_reads['Module Statuses'].apply(
    lambda statuses: [status_replacements.get(status, status) for status in statuses]
)

#### add inner array data from Module Statuses as df columns

In [9]:
def create_status_column_name(module_nr):
    """Return the status column name for a module number, e.g. 'module_3_status'."""
    return f'module_{module_nr!s}_status'

# One column name per entry of the first status list.
column_names_status = [
    create_status_column_name(module_nr)
    for module_nr in range(len(ngs_reads['Module Statuses'][0]))
]

# Spread the status lists over one column per module, attach them to the
# dataset and remove the original list column.
statuses_df = pd.DataFrame(ngs_reads['Module Statuses'].to_list(), columns=column_names_status)
ngs_reads = ngs_reads.join(statuses_df)
ngs_reads = ngs_reads.drop(columns='Module Statuses')

### (Todo: encode technology)

#### remove status of module 0  
Module 0 is Basic Statistics and always has the same value (pass/2)

In [10]:
# Count the distinct values in column module_0_status (missing values included).
ngs_reads['module_0_status'].nunique(dropna=False)

1

In [11]:
# module_0_status holds a single value for every row, so it is removed.
ngs_reads = ngs_reads.drop(columns='module_0_status')

#### export complete prepared dataset

In [12]:
# Report the shape of the prepared dataset and write it to JSON.
print('export full dataset')
print(f'shape - {ngs_reads.shape}')
ngs_reads.to_json('exported_datasets/prepared_dataset.json')

export full dataset
shape - (184, 28)


#### remove module columns for export to simple organism datasets

In [13]:
# Positions of the first ten columns, which are left out of the simple dataset.
cols = list(range(10))
ngs_reads_simple = ngs_reads.drop(columns=ngs_reads.columns[cols])
ngs_reads_simple.head()

Unnamed: 0,organism,technology,read_number,evaluation,total_sequences,percent_gc,min_sequence_length,max_sequence_length,module_1_status,module_2_status,module_3_status,module_4_status,module_5_status,module_6_status,module_7_status,module_8_status,module_9_status,module_10_status
0,Sau,MS,2,0,511669,34,35,301,0,2,2,0,2,2,1,2,2,2
1,Sau,MS,1,0,220494,34,35,301,2,1,2,0,2,2,1,2,2,2
2,Sau,MS,1,0,6015314,65,35,301,0,1,2,0,1,2,1,1,2,2
3,Sau,MS,1,0,1916272,35,35,301,2,2,2,0,2,2,1,1,2,2
4,Sau,MS,2,0,334,35,69,301,0,0,2,0,0,2,1,2,1,2


#### split dataset by organism and export dataset for each

In [14]:
# Split the simple dataset into one DataFrame per organism.
# reset_index returns a new DataFrame, so its result has to be assigned;
# calling it without assignment leaves the original row labels in place and
# the per-organism exports would carry non-contiguous indices.
grouped = ngs_reads_simple.groupby('organism')
efcm = grouped.get_group('Efcm').reset_index(drop=True)
sau = grouped.get_group('Sau').reset_index(drop=True)
ecoli = grouped.get_group('Ecoli').reset_index(drop=True)

# Display the E. coli subset as a check.
ecoli

Unnamed: 0,organism,technology,read_number,evaluation,total_sequences,percent_gc,min_sequence_length,max_sequence_length,module_1_status,module_2_status,module_3_status,module_4_status,module_5_status,module_6_status,module_7_status,module_8_status,module_9_status,module_10_status
0,Ecoli,MS,2,0,23236,51,35,301,0,0,2,0,1,2,1,2,1,2
1,Ecoli,MS,2,0,949,51,35,301,0,0,0,0,1,2,1,2,1,2
2,Ecoli,MS,1,0,866227,51,35,301,0,2,2,1,1,2,1,2,2,2
3,Ecoli,MS,2,0,378359,50,35,301,0,2,2,1,1,2,1,2,2,2
4,Ecoli,MS,1,0,806181,51,35,301,0,2,2,1,1,2,1,2,2,2
5,Ecoli,MS,1,0,23236,51,35,301,2,1,2,0,1,2,1,2,1,2
6,Ecoli,MS,2,0,806181,51,35,301,0,1,2,1,1,2,1,2,2,2
7,Ecoli,MS,1,0,949,51,35,301,1,0,2,0,0,2,1,2,1,2
8,Ecoli,MS,1,0,378359,50,35,301,0,1,2,1,0,2,1,2,2,2
9,Ecoli,MS,2,0,866227,51,35,301,0,2,2,1,1,2,1,2,2,2


In [15]:
# Display the simple dataset (all organisms) before it is exported.
ngs_reads_simple

Unnamed: 0,organism,technology,read_number,evaluation,total_sequences,percent_gc,min_sequence_length,max_sequence_length,module_1_status,module_2_status,module_3_status,module_4_status,module_5_status,module_6_status,module_7_status,module_8_status,module_9_status,module_10_status
0,Sau,MS,2,0,511669,34,35,301,0,2,2,0,2,2,1,2,2,2
1,Sau,MS,1,0,220494,34,35,301,2,1,2,0,2,2,1,2,2,2
2,Sau,MS,1,0,6015314,65,35,301,0,1,2,0,1,2,1,1,2,2
3,Sau,MS,1,0,1916272,35,35,301,2,2,2,0,2,2,1,1,2,2
4,Sau,MS,2,0,334,35,69,301,0,0,2,0,0,2,1,2,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179,Efcm,HS,1,1,1367226,38,35,251,2,2,2,0,2,2,1,0,1,2
180,Efcm,HS,2,1,1836688,39,35,251,2,2,2,0,2,2,1,0,2,2
181,Efcm,HS,1,1,799870,37,35,251,2,2,2,0,2,2,1,0,1,2
182,Efcm,HS,2,1,1369977,39,35,251,2,2,2,0,2,2,1,0,2,2


In [16]:
# Report the shape of every simple dataset and write each one to its JSON file.
print('exported datasets')
for label, dataset, path in (
    ('all', ngs_reads_simple, 'exported_datasets/all_simple.json'),
    ('efcm', efcm, 'exported_datasets/efcm_simple.json'),
    ('sau', sau, 'exported_datasets/sau_simple.json'),
    ('ecoli', ecoli, 'exported_datasets/ecoli_simple.json'),
):
    print(label, '-', dataset.shape)
    dataset.to_json(path)

exported datasets
all - (184, 18)
efcm - (88, 18)
sau - (78, 18)
ecoli - (18, 18)
