# Extração de features

In [1]:
import os

import pandas as pd

import utils.feature_extraction as fe

In [2]:
# Dataset registry: label -> (input FASTA path, output CSV path).
# The label doubles as the prefix of the intermediate files created during
# feature extraction. Entries are grouped by their source directory.
dsfiles = {
  # data/PresRAT
  'PresRAT_sRNA': (
    'data/PresRAT/raw/sRNA_total.100cl_1.fasta',
    'data/PresRAT/sRNA_total.100cl_1.csv',
  ),
  # data/RegulonDB
  'RegulonDB_sRNASet': (
    'data/RegulonDB/sRNASet.fa',
    'data/RegulonDB/sRNASet.csv',
  ),
  # data/SRD
  'SRD_sRNA_JKD6008': (
    'data/SRD/raw/sRNA_JKD6008.fasta',
    'data/SRD/sRNA_JKD6008.csv',
  ),
  'SRD_sRNA_N315': (
    'data/SRD/raw/sRNA_N315.fasta',
    'data/SRD/sRNA_N315.csv',
  ),
  'SRD_sRNA_NCTC8325': (
    'data/SRD/raw/sRNA_NCTC8325.fasta',
    'data/SRD/sRNA_NCTC8325.csv',
  ),
  'SRD_sRNA_Newman': (
    'data/SRD/raw/sRNA_Newman.fasta',
    'data/SRD/sRNA_Newman.csv',
  ),
  'SRD_sRNA_USA300_FPR3757': (
    'data/SRD/raw/sRNA_USA300_FPR3757.fasta',
    'data/SRD/sRNA_USA300_FPR3757.csv',
  ),
  # data/CoryneRegNet
  'CoryneRegNet': (
    'data/CoryneRegNet/CoryneRegNet_rna.fa',
    'data/CoryneRegNet/CoryneRegNet_rna.csv',
  ),
  'CoryneRegNet-exp': (
    'data/CoryneRegNet/CoryneRegNet_rna_exp.fa',
    'data/CoryneRegNet/CoryneRegNet_rna_exp.csv',
  ),
  'CoryneRegNet-pred': (
    'data/CoryneRegNet/CoryneRegNet_rna_pred.fa',
    'data/CoryneRegNet/CoryneRegNet_rna_pred.csv',
  ),
  # data/Rfam
  'Rfam': (
    'data/Rfam/raw.fa',
    'data/Rfam/raw.csv',
  ),
  'Rfam_bacteria': (
    'data/Rfam/bacteria.fa',
    'data/Rfam/bacteria.csv',
  ),
}

In [3]:
def extraction_features(input, output, label):
  """Extract sequence features from a FASTA file and store them as CSV.

  Nothing is done when `output` already exists, so re-running the notebook
  only processes datasets that do not have a CSV yet.

  Steps: the input is converted with `fe.convert_to_dna`, cleaned with
  `fe.sequence_cleaner`, then four feature tables (length, 3-mers, Fickett
  score, coding class) are computed into separate temporary files, merged
  with `fe.merge`, and finally rewritten through pandas so floats are
  stored with 5 decimal places.

  Args:
    input: path of the source FASTA file. (The name shadows the builtin
      `input`; it is kept so keyword callers keep working.)
    output: path of the CSV file to create.
    label: dataset label; forwarded to the `fe.calc_*` helpers and used as
      the file-name prefix of every intermediate file under `./tmp`.

  Returns:
    None. The result is the CSV file written at `output`.
  """
  # Cache guard: an existing CSV is treated as already extracted.
  if os.path.isfile(output):
    return

  tmpdirname = './tmp'

  # The helpers write directly into these locations, so make sure both the
  # scratch directory and the destination directory exist before starting
  # (otherwise the first write fails, or the final write fails only after
  # all the expensive extraction steps have run).
  os.makedirs(tmpdirname, exist_ok=True)
  os.makedirs(os.path.dirname(output) or '.', exist_ok=True)

  # tmp_files[0..3] hold one feature table each; tmp_files[4] is the merge.
  tmp_files = [
    os.path.join(tmpdirname, '{}_{}.tmp'.format(label, n)) for n in range(5)
  ]
  fasta_dna = os.path.join(tmpdirname, label + '_dna.fa')
  fasta_cleaned = os.path.join(tmpdirname, label + '_cleaned.fa')

  # prepare
  fe.convert_to_dna(input, fasta_dna)
  fe.sequence_cleaner(fasta_dna, fasta_cleaned, min_length=0)

  # extract features
  fe.calc_length(label, fasta_cleaned, tmp_files[0])
  fe.calc_kmers(label, fasta_cleaned, tmp_files[1], size=3)
  fe.calc_fickettScore(label, fasta_cleaned, tmp_files[2])
  fe.calc_codingClass(label, fasta_cleaned, tmp_files[3])
  fe.merge(tmp_files[:4], tmp_files[4])

  # Round-trip through pandas only to normalise float precision.
  pd.read_csv(tmp_files[4]).to_csv(output, index=False, float_format='%.5f')

  # Only the .tmp tables are removed; the intermediate FASTA files
  # (fasta_dna, fasta_cleaned) are intentionally left in ./tmp, as before.
  fe.rmfiles(tmp_files)

# Run the extraction for every registered dataset; the dict key is the label.
for k in dsfiles:
  fa_input, csv_output = dsfiles[k]
  extraction_features(input=fa_input, output=csv_output, label=k)

fe.convert_to_dna data/Rfam/bacteria.fa
sed '/^[^>]/s/u/t/g' data/Rfam/bacteria.fa | sed '/^[^>]/s/U/T/g' > ./tmp/Rfam_bacteria_dna.fa
fe.sequence_cleaner ./tmp/Rfam_bacteria_dna.fa
fe.calc_length ./tmp/Rfam_bacteria_0.tmp
printf "nameseq\tlength\n" > /tmp/tmpal_90c_c/tmp1.csv
seqkit fx2tab -lin ./tmp/Rfam_bacteria_cleaned.fa >> /tmp/tmpal_90c_c/tmp1.csv
awk 'BEGIN{FS="\t";OFS=","} {print $1,$2}' /tmp/tmpal_90c_c/tmp1.csv > ./tmp/Rfam_bacteria_0.tmp
fe.calc_kmers ./tmp/Rfam_bacteria_1.tmp
[ -e ./tmp/Rfam_bacteria_1.tmp ] && rm ./tmp/Rfam_bacteria_1.tmp
(echo 3 | (conda run -n mathfeature-terminal --no-capture-output python /home/alisson/work/MathFeature/methods/ExtractionTechniques.py -o ./tmp/Rfam_bacteria_1.tmp -l Rfam_bacteria -t kmer -seq 1 -i ./tmp/Rfam_bacteria_cleaned.fa > /dev/null))
fe.calc_fickettScore ./tmp/Rfam_bacteria_2.tmp
[ -e ./tmp/Rfam_bacteria_2.tmp ] && rm ./tmp/Rfam_bacteria_2.tmp
conda run -n mathfeature-terminal python /home/alisson/work/MathFeature/methods/Ficke