# Description

Create summary from NCBI RAPT pipeline results files.

**IN:** Result folders from the NCBI RAPT pipeline with:
- assembly_stat_report.tsv
- annot.gbk
- ani-tax-report.xml

**Out:**
    CSV summary

In [106]:
# List every folder found below the current working directory (the sample result folders)
import os

for parent, subfolders, _files in os.walk(".", topdown=False):
    # Build the full path first, then print one folder per line.
    for folder_path in (os.path.join(parent, sub) for sub in subfolders):
        print(folder_path)

./557259f-w06-e272_output
./41c95d3-5sy-4c24_output
./9095857-87h-bf63_output
./1c93b92-daa-0119_output
./c6376e7-zs3-f15e_output
./9f88d34-den-f79d_output
./14030d9-c6s-2d8b_output
./0d0a7f5-ohp-5f3f_output
./299ef09-kfr-1d26_output
./20670e9-ygs-94d0_output
./65e1f2c-266-3d1c_output
./2bccc4b-h24-b74f_output
./9c12216-8k1-c82a_output
./ceeec5e-wku-15fe_output
./95ed6de-u4q-0073_output
./ebada13-1sc-998b_output
./1a7b916-oyi-74d4_output
./584b677-23z-4771_output
./4284c3c-28h-6025_output
./3dc10cd-rma-91cd_output
./bab69c5-y4f-1d90_output
./9bbd667-cdx-18d2_output
./4064a86-wfd-7fdf_output
./4a77690-ln9-205f_output
./06543a2-xjb-58b4_output
./de99f17-h2m-9975_output
./3a960cc-z7g-6efb_output
./a128c84-v7q-266f_output
./5fcf272-f3y-4925_output
./9289583-236-4b70_output
./ab5ce9f-ki7-eb16_output
./ac828eb-jdv-3aee_output
./deca316-tnh-9af8_output


## Get assembly stats
From assembly_stat_report.tsv get:
- Total sequence count
- Length of the longest sequence (bp)

In [119]:
import pandas as pd

def assembly(root, name):
    """Read the assembly statistics of one RAPT result folder.

    Parameters
    ----------
    root : str
        Directory that contains the result folder.
    name : str
        Name of the result folder inside ``root``.

    Returns
    -------
    tuple of pandas.Series
        The ``Total_seqs`` and ``Max_seq_len(bp)`` columns of
        ``assembly_stat_report.tsv``, in that order. The value of the
        first data row is read with ``[0]``.
    """
    # os.path.join and an explicit "\t" escape replace the hand-built '/'
    # path and the invisible literal tab character.
    report_path = os.path.join(root, name, 'assembly_stat_report.tsv')
    assembly_stat = pd.read_csv(report_path, sep="\t")

    # Return the columns themselves: getContig and the summary loop both index
    # the result with [0]. Returning the already-extracted scalar made that
    # second [0] raise IndexError (a NumPy scalar is not subscriptable).
    return assembly_stat["Total_seqs"], assembly_stat["Max_seq_len(bp)"]


## Get longest contig

In [110]:
def getContig(root, name, longest_contig):
    """Find the GenBank record whose length matches the longest contig.

    The records of ``annot.gbk`` in the result folder ``root/name`` are
    scanned in file order and the first one whose length equals
    ``longest_contig[0]`` is handed back. When no record has that length,
    the integer ``0`` is returned instead.
    """
    from Bio import SeqIO

    genbank_path = os.path.join(root, name)+'/annot.gbk'
    records = SeqIO.parse(genbank_path, "genbank")

    # First record of the wanted length wins; 0 is the "not found" marker.
    return next(
        (record for record in records if longest_contig[0] == len(record)),
        0,
    )

## Get predicted taxa

In [117]:
from lxml import etree

def getTaxa(root, name):
    """Read the submitted and predicted taxon entries of one RAPT result folder.

    Parameters
    ----------
    root : str
        Directory that contains the result folder.
    name : str
        Name of the result folder inside ``root``.

    Returns
    -------
    tuple
        ``(submitted_taxid, predicted_taxid)``: the first and the second
        child element of the first child of the root of
        ``ani-tax-report.xml``. The summary loop reads their ``tag`` and
        their ``org-name`` / ``rank`` attributes.
    """
    tree = etree.parse(os.path.join(root, name, 'ani-tax-report.xml'))

    # The elements are picked purely by position.
    # NOTE(review): assumes the report always lists the submitted taxon before
    # the predicted one - confirm against the RAPT report schema.
    first_entry = tree.getroot()[0]
    submitted_taxid = first_entry[0]
    predicted_taxid = first_entry[1]

    return submitted_taxid, predicted_taxid

## Create summary file

In [125]:
# Write the summary CSV and print progress for each sample
import csv
import sys
import os

# Column titles of the summary table, in the order the values are written below.
header = ['predicted_taxid', 'submitted_taxid', 'Conting Count', 'largest contig (bp)', 'largest contig (seq)'] # 'CDSs', 'rRNAs']

# Report files that getTaxa, assembly and getContig read from each result folder.
required_files = ('ani-tax-report.xml', 'assembly_stat_report.tsv', 'annot.gbk')

# NOTE(review): the file name 'mutations.csv' does not describe this summary;
# it is kept unchanged in case something downstream expects it - confirm.
with open('mutations.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    for root, dirs, files in os.walk(".", topdown=False):
        for name in dirs:
            sample_dir = os.path.join(root, name)

            # os.walk yields every sub-directory at any depth. Skip folders
            # that are not complete result folders instead of aborting the
            # whole run on the first missing report file.
            missing = [fn for fn in required_files
                       if not os.path.isfile(os.path.join(sample_dir, fn))]
            if missing:
                print('skip:', sample_dir, '(missing: {})'.format(', '.join(missing)))
                continue

            print('parse:', sample_dir)

            submitted_taxid, predicted_taxid = getTaxa(root, name)
            print(submitted_taxid.tag, ":", submitted_taxid.get("org-name"), "({})".format(submitted_taxid.get("rank")))
            print(predicted_taxid.tag, ":", predicted_taxid.get("org-name"), "({})".format(predicted_taxid.get("rank")))

            totelSeq, longest_contig = assembly(root, name)
            print(totelSeq, longest_contig)

            seq_record = getContig(root, name, longest_contig)

            # getContig returns 0 when no record has the reported length; write
            # an empty sequence for that sample rather than failing on `.seq`.
            # hasattr is used because the record type is not guaranteed to
            # support truthiness or == comparison with 0.
            if hasattr(seq_record, 'seq'):
                longest_seq = str(seq_record.seq)
            else:
                print('no contig of the reported length found in', sample_dir)
                longest_seq = ''

            writer.writerow([predicted_taxid.get("org-name"), submitted_taxid.get("org-name"), totelSeq[0], longest_contig[0], longest_seq])

parse: ./557259f-w06-e272_output
submitted-taxid : Bacillus pumilus (species)
predicted-taxid : Bacillus pumilus (species)
0    734
Name: Total_seqs, dtype: int64 0    60095
Name: Max_seq_len(bp), dtype: int64
Contig_392_27.4353
Seq('CTGCTCAGTGAGAGCGGCTTTCATTATGCAGAGAAATCATTCCGATTATTAAGC...CCA')
60095
parse: ./41c95d3-5sy-4c24_output
submitted-taxid : Pseudomonas fluorescens (species)
predicted-taxid : Pseudomonas kilonensis (species)
0    581
Name: Total_seqs, dtype: int64 0    117321
Name: Max_seq_len(bp), dtype: int64
Contig_117_21.2459
Seq('ATGCCTTGAGGAAATCCACTGCTTCATGGATCTCTCTGATCTCGACGATCCCCG...TTA')
117321
parse: ./9095857-87h-bf63_output
submitted-taxid : Streptomyces (genus)
predicted-taxid : [Kitasatospora] papulosa (species)
0    2433
Name: Total_seqs, dtype: int64 0    31978
Name: Max_seq_len(bp), dtype: int64
Contig_1796_19.7675
Seq('GGCCGGGAAGAGCGCGAGGGAGACGGCGGTCGGGTAGCAGCCGGGCACGGCGAT...CCC')
31978
parse: ./1c93b92-daa-0119_output
submitted-taxid : Streptomyces (genus)


KeyboardInterrupt: 