# The alfie module


## Evaluate sequences with alfie's pre-built kingdom level classifier


In [34]:
from alfie.alf import classify_records
#fasta/fastq input and output 
from alfie.seqio import read_fasta, read_fastq, write_fasta, write_fastq

### Read in data 

In [39]:
# ex_fastq_file: example fastq input exported by the alfie package
# (presumably a file path, since it is handed straight to read_fastq)
from alfie import ex_fastq_file
# parse the example file into an indexable collection of records
example_fastq = read_fastq(ex_fastq_file)

# alternatively, import the already-parsed example records directly
#from alfie import example_fastq

# display the first record: a dict with 'name', 'sequence', 'plus' and 'quality' keys
example_fastq[0]

{'name': '@seq1_plantae',
 'sequence': 'ttctaggagcatgtatatctatgctaatccgaatggaattagctcaaccaggtaaccatttgcttttaggtaatcaccaagtatacaatgttttaattacagcacatgcttttttaatgattttttttatggtaatgcctgtaatgattggtggttttggtaattggttagttcctattatgataggaagtccagatatggcttttcctagactaaataacatatctttttgacttcttccaccttctttatgtttacttttagcttcttcaatggttgaagtaggtgttggaacaggatgaactgtttatcctccccttagttcgatacaaagtcattcaggcggagctgttgatttagcaatttttagcttacatttatctggagcttcatcgattttaggagctgtcaattttatttctacgattctaaatatgcgtaatcctgggcaaagcatgtatcgaatgccattatttgtttgatctatttttgtaacggca',
 'plus': '+',
 'quality': '~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [38]:
# ex_fasta_file: example fasta input exported by the alfie package
# (presumably a file path, since it is handed straight to read_fasta)
from alfie import ex_fasta_file

# parse the example file into an indexable collection of records
ex_fa = read_fasta(ex_fasta_file)
# or read the same file through its path relative to the example directory of the repository
#ex_fa = read_fasta('../alfie/data/example_data.fasta')

# display the first record: a dict with 'name' and 'sequence' keys
# NOTE(review): the displayed 'name' keeps a trailing '\n'; strip it before reusing it as a header
ex_fa[0]

{'name': 'seq1_plantae\n',
 'sequence': 'TTCTAGGAGCATGTATATCTATGCTAATCCGAATGGAATTAGCTCAACCAGGTAACCATTTGCTTTTAGGTAATCACCAAGTATACAATGTTTTAATTACAGCACATGCTTTTTTAATGATTTTTTTTATGGTAATGCCTGTAATGATTGGTGGTTTTGGTAATTGGTTAGTTCCTATTATGATAGGAAGTCCAGATATGGCTTTTCCTAGACTAAATAACATATCTTTTTGACTTCTTCCACCTTCTTTATGTTTACTTTTAGCTTCTTCAATGGTTGAAGTAGGTGTTGGAACAGGATGAACTGTTTATCCTCCCCTTAGTTCGATACAAAGTCATTCAGGCGGAGCTGTTGATTTAGCAATTTTTAGCTTACATTTATCTGGAGCTTCATCGATTTTAGGAGCTGTCAATTTTATTTCTACGATTCTAAATATGCGTAATCCTGGGCAAAGCATGTATCGAATGCCATTATTTGTTTGATCTATTTTTGTAACGGCA'}

### Classify sequences

In [None]:
# Classify the fasta records parsed above (ex_fa) with alfie's pre-built
# kingdom-level model. The parsed records are passed rather than the file
# path: the function returns the records alongside one prediction per record.
# NOTE(review): confirm against the classify_records signature in your alfie version.
seq_records, predictions = classify_records(ex_fa)


In [None]:
#make new headers

In [None]:
# TODO: unfinished placeholder - write_fasta is called with no arguments here.
# Presumably it needs the classified records and an output file name;
# confirm the expected arguments against alfie.seqio before running this cell.
write_fasta()


## Train and test a custom, alignment-free taxonomic classifier with alfie helper functions

In addition to using alfie as a kingdom level classifier, alfie's helper functions can also aid you in training a custom DNA barcode classification model.

Some common applications of this functionality are training a classifier for a sub-group of interest (e.g. a Mollusca classifier similar to the Annelida example constructed here), or training a binary classifier to isolate barcodes from a specific taxonomic group (e.g. a classifier that says whether or not an input sequence is from a teleost fish).

In [15]:
import numpy as np
import pandas as pd

import tensorflow as tf

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelBinarizer



In [7]:
from alfie.kmerseq import KmerFeatures
from alfie.training import stratified_taxon_split, sample_seq, process_sequences, alfie_dnn_default

### loading the demo data

The demo data can be found in the [alfie GitHub repository](https://github.com/CNuge/alfie/tree/master/alfie/data). The relative import below assumes you have downloaded the alfie repository from GitHub and that your working directory is: `alfie/example`.

In [4]:
# load the tab-separated demo training table; the path is relative to the alfie/example directory
data = pd.read_csv('../alfie/data/alfie_small_train_example.tsv', sep = '\t')

In [5]:
data.head()

Unnamed: 0,processid,sequence,phylum,class,order,family,genus
0,GAHAP309-13,accttatactttattctgggcgtatgagcaggaatattgggtgcag...,Annelida,Clitellata,Enchytraeida,Enchytraeidae,Grania
1,GAHAP2002-14,accctatatttcattctcggagtttgagctggcatagtaggtgccg...,Annelida,Clitellata,Haplotaxida,Lumbricidae,Aporrectodea
2,GBAN15302-19,actctatacttaatttttggtatt-gagccggtatagtaggaacag...,Annelida,Clitellata,Haplotaxida,Naididae,Ainudrilus
3,GBAN11905-19,acactatattttattttaggaatttgagctggaataattggagcag...,Annelida,Clitellata,Crassiclitellata,Megascolecidae,Metaphire
4,GBAN15299-19,acattatacctaattta-ggtgtatgagccggaatagttggaacag...,Annelida,Clitellata,Haplotaxida,Naididae,Ainudrilus


For simplicity, the demo is conducted with 10,000 sequences from the phylum Annelida, which has only two classes. We will train a model to predict the class for Annelida sequences.


In [6]:
data['class'].value_counts()

Clitellata    6187
Polychaeta    3813
Name: class, dtype: int64

### Conducting a train/test split


The alfie function `stratified_taxon_split` can be used to split a dataframe in a stratified fashion based on the taxonomic data in a column. This ensures that each taxonomic group is proportionally represented in the training and test data.

In [8]:
# hold out 30% of the rows as the test set, stratifying the split on the 'class' column
train, test = stratified_taxon_split(
    data,
    test_size=0.3,
    class_col='class',
)

Conducting train/test split, split evenly by: class


We can call some summary functions on the output dataframes to verify the even split of the data.

(7000, 7)

In [10]:
test.shape

(3000, 7)

In [24]:
# summarise the training split: overall shape, then the number of rows per class
print("train data shape:",train.shape)
print(train['class'].value_counts())
# blank lines to separate the two summaries in the output
print("\n")

# same summary for the test split; the class proportions should match the training split
print("test data shape:", test.shape)
print(test['class'].value_counts())


train data shape: (7000, 7)
Clitellata    4331
Polychaeta    2669
Name: class, dtype: int64


test data shape: (3000, 7)
Clitellata    1856
Polychaeta    1144
Name: class, dtype: int64


In [25]:
test['class'].value_counts()

Clitellata    1856
Polychaeta    1144
Name: class, dtype: int64

### Encoding the response data


In [26]:
print("encoding y arrays")
# pull the raw class labels (the response variable) out of each split
y_train_raw =  train['class']
y_test_raw = test['class']

# encoder that maps the class labels to a binary indicator array
tax_encoder = LabelBinarizer()

# fit the encoder on the training labels only, then reuse the fitted
# mapping for the test labels so both splits share the same encoding
y_train = tax_encoder.fit_transform(y_train_raw)
y_test = tax_encoder.transform(y_test_raw)



encoding y arrays


In [27]:
print(y_train.shape)
print(y_test.shape)


(7000, 1)
(3000, 1)


In [None]:
"""
# uncomment these lines to save the arrays to your computer
print("saving y to files")
np.save('example_y_train.npy', y_train)
np.save('example_y_test.npy', y_test)
"""

### Encoding the predictor data


#### kmer features

In [29]:
?KmerFeatures

Object `KmerFeatures` not found.


In [None]:
# Build kmer features for the first row of the training split.
# .iloc[0] selects by position; label-based [0] would raise a KeyError
# if the row labelled 0 was assigned to the test split.
x1 = KmerFeatures(name = train['processid'].iloc[0], sequence = train['sequence'].iloc[0])

#### random subsampling

In [None]:


# demonstrate on a single sequence, i.e. the first row of the training split
# .iloc[0] selects by position; label-based [0] would raise a KeyError
# if the row labelled 0 was assigned to the test split.
sub_seq = sample_seq(train['sequence'].iloc[0])



`sample_seq` can be used for upsampling as well: pass `n` to draw several subsamples from a single sequence.

In [None]:
# draw 10 subsamples from the first training sequence
# .iloc[0] selects by position; label-based [0] would raise a KeyError
# if the row labelled 0 was assigned to the test split.
sub_seq = sample_seq(train['sequence'].iloc[0], n = 10)


#### batch processing

Here the data are processed with the default kmer size (`k = [4]`) and a single subsample per sequence in the input dataframe (`n = 1`).

In [None]:
# Convert each split into kmer feature data, labelled by the 'class' column.
# The test features must be built from `test`; building them from `train`
# would make the evaluation data a copy of the training data.
train_kmer_data = process_sequences(train, label_col = 'class')
test_kmer_data = process_sequences(test, label_col = 'class')

The output dictionaries of the `process_sequences` function can be easily turned into numpy arrays, which are compatible inputs for most machine learning algorithms.

In [None]:
print("building X arrays")
# The 'data' entry of each process_sequences output is turned into a numpy
# array. The names must match the dictionaries created in the batch
# processing cell (train_kmer_data / test_kmer_data).
X_train = np.array(train_kmer_data['data'])
X_test = np.array(test_kmer_data['data'])


In [None]:
"""
# uncomment these lines to save the arrays to your computer
# (file names mirror the example_y_*.npy files saved for the response arrays)
print("saving X to files")
np.save('example_X_train.npy', X_train)
np.save('example_X_test.npy', X_test)
"""