# Description

Create a summary from NCBI RAPT pipeline result files.

**IN:** Result folders from the NCBI RAPT pipeline with:
- assembly_stat_report.tsv
- annot.gbk
- ani-tax-report.xml

**Out:**
    CSV summary

**Author:** Edgars Liepa

**Email:** edgars.liepa@biomed.lu.lv

Developed at the Latvian Biomedical Research and Study Centre

03.11.2022

In [None]:
# Get the list of folders in the local directory that hold sample results
import os
# Walk the current directory bottom-up (topdown=False) and print the path of
# every sub-directory found, one per line.
for root, dirs, files in os.walk(".", topdown=False):
    folder_paths = [os.path.join(root, name) for name in dirs]
    for folder_path in folder_paths:
        print(folder_path)

## Get assembly stats
From assembly_stat_report.tsv get:
- Total sequence count
- Total sequence length (bp)

In [None]:
import pandas as pd

def assembly(root, name):
    """Read the assembly statistics of one result folder.

    Loads ``<root>/<name>/assembly_stat_report.tsv`` as a tab-separated table
    and returns the values found in its first data row.

    Args:
        root: Directory that contains the result folder.
        name: Name of the result folder inside ``root``.

    Returns:
        A ``(Total_seqs, Total_len(bp))`` tuple taken from the first row of
        the report.

    Raises:
        FileNotFoundError: If the report file does not exist.
        KeyError: If one of the two expected columns is missing.
        IndexError: If the report contains no data rows.
    """
    report_path = os.path.join(root, name, 'assembly_stat_report.tsv')

    # Explicit escape instead of a literal tab character, which editors and
    # copy/paste easily turn into spaces.
    assembly_stat = pd.read_csv(report_path, sep="\t")

    # .iloc[0] picks the first row by position, so it does not depend on the
    # DataFrame having a default 0..n-1 index. Indexing each column separately
    # keeps the column's own dtype.
    total_seqs = assembly_stat["Total_seqs"].iloc[0]
    total_len = assembly_stat["Total_len(bp)"].iloc[0]

    return total_seqs, total_len


## Get longest contig

In [None]:
def getContig(root, name, longest_contig):
    """Find the GenBank record whose length equals ``longest_contig``.

    Parses ``<root>/<name>/annot.gbk`` and returns the first record whose
    ``len()`` matches the requested length.

    Args:
        root: Directory that contains the result folder.
        name: Name of the result folder inside ``root``.
        longest_contig: Sequence length to look for.

    Returns:
        The first matching record, or ``0`` when no record has that length.
    """
    from Bio import SeqIO

    gbk_path = os.path.join(root, name) + '/annot.gbk'
    same_length = (
        record
        for record in SeqIO.parse(gbk_path, "genbank")
        if longest_contig == len(record)
    )

    # 0 is the "not found" sentinel of this function.
    return next(same_length, 0)

## Get predicted taxa

Parse ani-tax-report.xml from the NCBI RAPT results and extract the predicted bacterial taxonomic name

In [None]:
from lxml import etree

def getTaxa(root, name):
    """Extract taxonomy and ANI information from one result folder.

    Parses ``<root>/<name>/ani-tax-report.xml`` and picks elements out of the
    document by position.

    Args:
        root: Directory that contains the result folder.
        name: Name of the result folder inside ``root``.

    Returns:
        ``None`` when the folder path contains ``".git/"``. Otherwise a
        4-tuple ``(submitted_taxid, predicted_taxid, ani, query_pct_coverage)``
        where the first two items are XML elements (first and second child of
        the document's first child) and the last two are the ``ANI`` and
        ``query_pct_coverage`` attributes of the first child of the document's
        second child.
    """
    sample_dir = os.path.join(root, name)

    # Paths inside a git repository folder are not parsed.
    # NOTE(review): callers that unpack the result must filter such folders
    # themselves, because None cannot be unpacked into four values.
    if ".git/" in sample_dir:
        return None

    report = etree.parse(sample_dir + '/ani-tax-report.xml').getroot()

    # The report is addressed purely by element position.
    submitted_taxid = report[0][0]
    predicted_taxid = report[0][1]
    ani_element = report[1][0]

    return (
        submitted_taxid,
        predicted_taxid,
        ani_element.get("ANI"),
        ani_element.get("query_pct_coverage"),
    )

## Create summary files

In [None]:
# Print Results
import csv
import sys
import os

# Column titles of the summary CSV, in the order the rows are written below.
# ('CDSs' and 'rRNAs' columns are not produced yet.)
header = [
    'Sample Name',
    'predicted_taxid (NCBI)',
    'submitted_taxid (Kraken)',
    'Predicted taxa confidance',
    'Average Nucleotide Identity (ANI)',
    "query_pct_coverage",
    'Conting Count',
    'Total Sequence Length',
]

# Files a folder must contain to be summarised as a RAPT result folder.
required_files = ('ani-tax-report.xml', 'assembly_stat_report.tsv')

# Table that maps RAPT result folder names ('NCBI RAPT NAME') to sample names
# ('SAMPLE NAME').
file_names = pd.read_csv('NCBI_rez_names.csv', sep=",")

with open('anotationStat.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    for root, dirs, files in os.walk(".", topdown=False):
        for name in dirs:
            sample_dir = os.path.join(root, name)

            # os.walk includes .git subdirectories. Only RAPT result folders
            # are needed, so the rest is ignored.
            if ".git" in sample_dir:
                continue

            # os.walk also yields nested and unrelated folders; without this
            # check the first folder lacking the report files would abort the
            # run and leave a half-written summary.
            if not all(os.path.isfile(os.path.join(sample_dir, required)) for required in required_files):
                print('skip (no RAPT result files):', sample_dir)
                continue

            print('parse:', sample_dir)

            # Resolve the sample name first so a missing or ambiguous mapping
            # is reported with the folder it belongs to.
            sample_names = file_names.loc[file_names['NCBI RAPT NAME'] == name, 'SAMPLE NAME']
            if len(sample_names) != 1:
                raise ValueError(
                    f"expected exactly one 'SAMPLE NAME' entry for folder {name!r} "
                    f"in NCBI_rez_names.csv, found {len(sample_names)}"
                )
            sample_name = sample_names.item()

            submitted_taxid, predicted_taxid, ani, query_pct_coverage = getTaxa(root, name)
            print(submitted_taxid.tag, ":", submitted_taxid.get("org-name"), "({})".format(submitted_taxid.get("rank")))
            print(predicted_taxid.tag, ":", predicted_taxid.get("org-name"), "({})".format(predicted_taxid.get("rank")))

            totelSeq, total_seq_len = assembly(root, name)
            print(totelSeq, total_seq_len)

            # The longest-contig lookup via getContig() is currently disabled.

            writer.writerow([
                sample_name,
                predicted_taxid.get("org-name"),
                submitted_taxid.get("org-name"),
                predicted_taxid.get("confidence"),
                ani,
                query_pct_coverage,
                totelSeq,
                total_seq_len,
            ])

            

## Get gene products

Parse the .gbk file and extract the products of annotated features (gene, CDS, rRNA, tRNA, ncRNA)

In [None]:
import csv
import sys
import os
from Bio import SeqIO

# Folder that is searched (recursively) for RAPT result folders.
results_dir = "/home/edgars.liepa/Becteria result"

for root, dirs, files in os.walk(results_dir, topdown=False):
    for name in dirs:
        sample_dir = os.path.join(root, name)

        # os.walk includes .git subdirectories. Only RAPT result folders are
        # needed, so the rest is ignored.
        if ".git" in sample_dir:
            continue

        # Folders without an annotation file are skipped before the output
        # file is opened, so no empty geneProducts.csv is left behind and the
        # run does not abort on them.
        gbk_path = os.path.join(sample_dir, 'annot.gbk')
        if not os.path.isfile(gbk_path):
            continue

        print('parse:', sample_dir)

        # Reset for every sample so that each geneProducts.csv starts with the
        # organism name and total gene count (previously only the very first
        # file of the whole run received this row).
        first = True

        with open(os.path.join(sample_dir, 'geneProducts.csv'), 'w', encoding='UTF8', newline='') as f:
            writer = csv.writer(f)

            for record in SeqIO.parse(gbk_path, "genbank"):

                if first:
                    writer.writerow([
                        record.annotations['organism'],
                        "GENES TOTAL: ",
                        record.annotations['structured_comment']['Genome-Annotation-Data']['Genes (total)'],
                    ])
                    first = False

                print(record.id)
                # NOTE(review): the label and the id are joined without a
                # separator; confirm whether "Contig ID: <id>" was intended.
                writer.writerow(["Contig ID" + record.id])

                # Only features that carry a 'product' qualifier are exported.
                for feature in record.features:
                    if 'product' in feature.qualifiers:
                        print(feature.type, feature.qualifiers['product'], feature.location)
                        writer.writerow([feature.type, feature.qualifiers['product'], feature.location])