### Imports

In [1]:
import logging
import pandas as pd
import re

In [2]:
logging.basicConfig(level=logging.DEBUG)

### Load Data

In [3]:
def load_fasta_data(input_dir: str, filename:str) -> str:
    with open(f'{input_dir}{filename}') as f:
        data_file  = f.read()
    logging.debug(f'{input_dir}{filename} is read in.')
    return data_file

In [4]:
def _split_string(raw_data: str) -> list:
    _data = raw_data.split(">")
    _data = _data[1:]
    _data_list = [x.split(",") for x in _data]
    return _data_list

In [5]:
#Not something done by me:
def _clean_id_names(text):
    
    id_and_names = text.split(' ',1)
    
    return id_and_names

def _clean_text(text):
    remove_lower = lambda text: re.sub('[a-z]', '', text)
    text = remove_lower(text)
    text = text.strip()
    text = text.replace('\n', '')
    return text

In [6]:
def preprocess_data(data_file: str) -> (list, list, list):
    logging.debug(f'...Pre-processing data...')
    _data_list = _split_string(data_file)
    
    _id_name = []
    _genome_sequence = []
    _description = []
    _ID = []
    _Description = []
    _clean_sequences = []

    for item in _data_list:
        _id_name.append(item[0])
        _genome_sequence.append(item[1])
    
    for item in _id_name:
        _description.append(_clean_id_names(item))
    
    for item in _description:
        _ID.append(item[0])
        _Description.append(item[1])
    
    for sequence in _genome_sequence:
        _clean_sequences.append(_clean_text(sequence))
    logging.debug(f'---Pre-processing complete---')
    return _ID, _Description, _clean_sequences

In [7]:
def listmaker(label: int, length: int) -> list:
    listofzeros = [label] * length
    return listofzeros

In [38]:
def create_dataframe(list_of_columns: list, column_headers: list) -> pd.DataFrame:
    logging.debug(f'Creating dataframe from list of columns.')
    
    list_of_lists = [list(x) for x in zip(list_of_columns)]
    
    logging.debug(f'The length of list_of_lists is: {len(list_of_lists)}.')
    logging.debug(f'The length of column_headers is: {len(column_headers)}.')
    logging.debug(f'The list of column headers looks like: {column_headers}')
    logging.debug(f'The actual data currently looks like: {list_of_lists} ')
    
    dataframe = pd.DataFrame(list_of_lists, columns=column_headers)
    return dataframe

In [39]:
def create_training_data(training_files: list) -> pd.DataFrame:
    logging.debug(f'Creating training dataframe.')
    training_dataframe = pd.DataFrame()
    
    for index in range(len(training_files)):
        logging.debug(f'Processing file at index: {index}')
        data_file = training_files[index]
        
        _ID, _Description, _clean_sequences = preprocess_data(data_file)
        
        label = index
        
        labels = listmaker(label, len(_clean_sequences))

        list_of_columns = [_ID, _Description, _clean_sequences, labels]
        column_headers = ['ID' , 'Description', 'Sequences', 'Labels']

        dataframe = create_dataframe(list_of_columns, column_headers)
        training_dataframe = pd.concat([training_dataframe, dataframe])
    
    return training_dataframe

In [40]:
def create_testing_data(data_file: str) -> pd.DataFrame:
    logging.debug(f'Creating testing dataframe.')
    testing_dataframe = pd.DataFrame()
    
    _ID, _Description, _clean_sequences = preprocess_data(data_file)

    list_of_columns = [_ID, _Description, _clean_sequences]
    column_headers = ['ID' , 'Description', 'Sequences']

    testing_dataframe = create_dataframe(list_of_columns, column_headers)
    
    return testing_dataframe

In [41]:
def write_dataframe_to_disk(dataframe: pd.DataFrame, output_dir: str, output_filename: str):
    dataframe.to_csv(f'{output_dir}{output_filename}.csv', index=False)
    logging.debug(f'File {output_dir}{output_filename}.csv written to disk.')

In [42]:
def main():
    input_dir = '../data/01_raw/'
    non_zoonotic_filename = "NonZoonoticVirusesTrain.fasta"
    zoonotic_filename = "ZoonoticVirusesTrain.fasta"
    test_data_filename = "VirusesTestInput.fasta"
    
    output_dir = '../data/04_features/'
    
    non_zoonotic_data_file = load_fasta_data(input_dir, non_zoonotic_filename)
    zoonotic_data_file = load_fasta_data(input_dir, zoonotic_filename)
    test_data_file = load_fasta_data(input_dir, test_data_filename)
    
    training_files = [zoonotic_data_file, non_zoonotic_data_file]
    
    training_dataframe = create_training_data(training_files)
    testing_dataframe = create_testing_data(test_data_file)
    write_dataframe_to_disk(training_dataframe, output_dir, output_filename = 'zoo_sequence_training_data')
    write_dataframe_to_disk(testing_dataframe, output_dir, output_filename = 'zoo_sequence_testing_data')

In [43]:
if __name__ == "__main__":
    main()

DEBUG:root:../data/01_raw/NonZoonoticVirusesTrain.fasta is read in.
DEBUG:root:../data/01_raw/ZoonoticVirusesTrain.fasta is read in.
DEBUG:root:../data/01_raw/VirusesTestInput.fasta is read in.
DEBUG:root:Creating training dataframe.
DEBUG:root:Processing file at index: 0
DEBUG:root:...Pre-processing data...
DEBUG:root:---Pre-processing complete---
DEBUG:root:Creating dataframe from list of columns.
DEBUG:root:The length of list_of_lists is: 4.
DEBUG:root:The length of column_headers is: 4.
DEBUG:root:The list of column headers looks like: ['ID', 'Description', 'Sequences', 'Labels']
DEBUG:root:The actual data currently looks like: [[['NC_003466.1', 'NC_003468.2', 'NC_003467.2', 'NC_009026.2', 'NC_004211.1', 'NC_004217.1', 'NC_004218.1', 'NC_004219.1', 'NC_004220.1', 'NC_004221.1', 'NC_004198.1', 'NC_004202.1', 'NC_004203.1', 'NC_004204.1', 'NC_004201.1', 'NC_033693.1', 'NC_005337.1', 'NC_001926.1', 'NC_001925.1', 'NC_001921.1', 'NC_034489.1', 'NC_034477.1', 'NC_034478.1', 'NC_004812.1

ValueError: 4 columns passed, passed data had 1 columns