## Prepare data for use in machine learning algorithms
* extract features from nested dataframes  
* convert features to numeric types

In [1]:
import pandas as pd
import numpy as np

#### import dataset

In [2]:
# Load the complete exported dataset from its JSON file into a dataframe
complete_set_path = 'exported_datasets/complete_set.json'
ngs_reads = pd.read_json(complete_set_path)

In [3]:
# convert dict type columns to dataframes
for category in ngs_reads:
    # Inspect the first row by position rather than by label: the index is only
    # reset in a later cell, so a row labelled 0 is not guaranteed to exist here.
    # isinstance also accepts dict subclasses, which `type(...) is dict` rejects.
    if isinstance(ngs_reads[category].iloc[0], dict):
        # every cell of the column becomes its own nested dataframe
        ngs_reads[category] = ngs_reads[category].apply(pd.DataFrame)

In [4]:
# Discard the current index and renumber the rows 0..n-1
ngs_reads = ngs_reads.reset_index(drop=True)

### Basic Statistics
#### extract features from Basic Statistics

In [5]:
# Display the nested Basic Statistics table stored in row 0
ngs_reads.loc[0, 'Basic Statistics']

Unnamed: 0,Measure,Value
0,Filename,200709_20-07968_20-00891_S21_L000_R2_001.fastq.gz
1,File type,Conventional base calls
2,Encoding,Sanger / Illumina 1.9
3,Total Sequences,511669
4,Sequences flagged as poor quality,0
5,Sequence length,35-301
6,%GC,34


In [6]:
# extract features from Basic Statistics

# Row positions of the measures inside each nested 'Basic Statistics' table
# (3 = Total Sequences, 5 = Sequence length, 6 = %GC in the table shown above).
# Row 4 (Sequences flagged as poor quality) is skipped - 0 for all data points.
TOTAL_SEQUENCES_ROW = 3
SEQUENCE_LENGTH_ROW = 5
PERCENT_GC_ROW = 6

total_sequences = []
percent_gc = []
min_sequence_length = []
max_sequence_length = []

for basic_statistics in ngs_reads['Basic Statistics']:
    values = basic_statistics['Value']
    # .iloc makes the row lookup explicitly positional, so it does not depend
    # on whether the nested table's index labels are integers or strings.
    total_sequences.append(values.iloc[TOTAL_SEQUENCES_ROW])
    percent_gc.append(values.iloc[PERCENT_GC_ROW])
    # Sequence length looks like '35-301'; a value without '-' yields the same
    # number for both the minimum and the maximum.
    length_min_max = str(values.iloc[SEQUENCE_LENGTH_ROW]).split('-')
    min_sequence_length.append(length_min_max[0])
    max_sequence_length.append(length_min_max[-1])

# convert the collected values to 64-bit integers before storing them as columns
total_sequences = np.asarray(total_sequences, dtype=np.int64)
percent_gc = np.asarray(percent_gc, dtype=np.int64)
min_sequence_length = np.asarray(min_sequence_length, dtype=np.int64)
max_sequence_length = np.asarray(max_sequence_length, dtype=np.int64)

ngs_reads['total_sequences'] = total_sequences
ngs_reads['percent_gc'] = percent_gc
ngs_reads['min_sequence_length'] = min_sequence_length
ngs_reads['max_sequence_length'] = max_sequence_length

# the nested tables are no longer needed once their features are flat columns
ngs_reads = ngs_reads.drop(columns='Basic Statistics')

### Evaluation (target variable)
#### convert evaluation value to numeric type

In [7]:
# Encode the target labels as integers: 'ugly' -> 0, 'good' -> 1.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the selected column: such a chained in-place
# call acts on a temporary object under pandas Copy-on-Write and would leave
# ngs_reads unchanged.
# infer_objects() gives the column an integer dtype in case replace() leaves it
# as object; it has no effect if the dtype is already numeric.
ngs_reads['evaluation'] = (
    ngs_reads['evaluation']
    .replace({'ugly': 0, 'good': 1})
    .infer_objects()
)

### Module Statuses
#### convert statuses to numeric type

# Numeric code for each module status; values missing from this mapping are
# kept unchanged by the .get(status, status) fallback below.
status_replacements = {'fail':0, 'warn':1, 'pass':2}

# Re-encode every row's list of statuses in a single column assignment.
# Writing through ngs_reads['Module Statuses'].iat[i] is chained assignment,
# which modifies a temporary object under pandas Copy-on-Write and would
# leave ngs_reads unchanged.
ngs_reads['Module Statuses'] = ngs_reads['Module Statuses'].map(
    lambda statuses: [status_replacements.get(status, status) for status in statuses]
)

#### add inner array data from Module Statuses as df columns

In [8]:
def create_status_column_name(module_nr):
    """Return the column name used for the status of module number `module_nr`."""
    return f'status_module_{module_nr}'

# One column name per entry in the status list of row 0
column_names_status = [
    create_status_column_name(module_nr)
    for module_nr in range(len(ngs_reads['Module Statuses'][0]))
]

# Spread each row's status list over separate columns, attach them by index,
# then remove the original list column
statuses_df = pd.DataFrame(ngs_reads['Module Statuses'].to_list(), columns=column_names_status)
ngs_reads = ngs_reads.join(statuses_df).drop(columns='Module Statuses')

#### show transformed columns

In [9]:
# Hide the columns at positions 0-12 and preview the remaining ones
cols = list(range(13))
ngs_reads.drop(columns=ngs_reads.columns[cols]).head()

Unnamed: 0,evaluation,total_sequences,percent_gc,min_sequence_length,max_sequence_length,status_module_0,status_module_1,status_module_2,status_module_3,status_module_4,status_module_5,status_module_6,status_module_7,status_module_8,status_module_9,status_module_10
0,0,511669,34,35,301,pass,fail,pass,pass,fail,pass,pass,warn,pass,pass,pass
1,0,220494,34,35,301,pass,pass,warn,pass,fail,pass,pass,warn,pass,pass,pass
2,0,6015314,65,35,301,pass,fail,warn,pass,fail,warn,pass,warn,warn,pass,pass
3,0,1916272,35,35,301,pass,pass,pass,pass,fail,pass,pass,warn,warn,pass,pass
4,0,334,35,69,301,pass,fail,fail,pass,fail,fail,pass,warn,pass,warn,pass


#### split dataset by organism and export dataset for each

In [10]:
# Partition the reads by organism and keep one dataframe per organism
grouped = ngs_reads.groupby(ngs_reads['organism'])
efcm, sau, ecoli = (
    grouped.get_group(organism) for organism in ('Efcm', 'Sau', 'Ecoli')
)

In [11]:
# (printed label, dataframe, output file) for each organism, in export order
exports = [
    ('efcm -', efcm, 'exported_datasets/efcm.json'),
    ('sau -', sau, 'exported_datasets/sau.json'),
    ('ecoli -', ecoli, 'exported_datasets/ecoli.json'),
]

print('exported datasets')
for label, organism_reads, json_path in exports:
    # report the shape of each subset, then write it to its own JSON file
    print(label, organism_reads.shape)
    organism_reads.to_json(json_path)

exported datasets
efcm - (88, 29)
sau - (78, 29)
ecoli - (18, 29)
