**ITS: comparing different k-mer sizes**

In [2]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from os import listdir
from pandas import read_csv, DataFrame
from tqdm import tqdm
from subprocess import call
from Bio.SeqIO import parse
from skbio.stats.composition import clr 


In [None]:
pip install -U kaleido

In [3]:
def create_fasta(output, mearged_pike_out, dbpath):
    """Write every index entry of the table to ``{output}/all_consensus.fasta``.

    Each sequence (an index label of ``mearged_pike_out``) becomes one FASTA
    record whose header is a running integer starting at 0.

    Parameters
    ----------
    output : directory that receives ``all_consensus.fasta``.
    mearged_pike_out : table whose index holds the sequences.
    dbpath : not used by this function.

    Returns
    -------
    dict
        ``{record number: sequence}`` in index order.
    """
    consensus = dict(enumerate(mearged_pike_out.index))

    with open(f'{output}/all_consensus.fasta', 'w') as fasta_handle:
        for record_id, sequence in consensus.items():
            fasta_handle.write(f'>{record_id}\n{sequence}\n')

    return consensus
    
def run_blast(base, path, num_threads=60):
    """Index the reference FASTA and BLAST the consensus sequences against it.

    Runs ``makeblastdb`` on ``base`` and then ``blastn`` with
    ``{path}/all_consensus.fasta`` as query, writing the tabular report
    (outfmt 7: qseqid sseqid pident evalue qcovs bitscore) to
    ``{path}/blast_results.txt``.

    Parameters
    ----------
    base : path of the reference FASTA; also used as the BLAST database name.
    path : directory holding ``all_consensus.fasta``; receives the report.
    num_threads : value passed to ``blastn -num_threads`` (default 60).

    Raises
    ------
    RuntimeError
        If either program exits with a non-zero status, so that a stale or
        missing ``blast_results.txt`` is never parsed silently.
    """
    # Argument lists (no shell) keep paths with spaces or shell
    # metacharacters intact and avoid command injection through the paths.
    commands = [
        ['makeblastdb', '-in', str(base), '-dbtype', 'nucl'],
        ['blastn',
         '-num_threads', str(num_threads),
         '-outfmt', '7 qseqid sseqid pident evalue qcovs bitscore',
         '-query', f'{path}/all_consensus.fasta',
         '-db', str(base),
         '-out', f'{path}/blast_results.txt'],
    ]

    for command in commands:
        exit_code = call(command)
        if exit_code != 0:
            raise RuntimeError(f'{command[0]} exited with status {exit_code}')
def decode_tax(base) -> dict:
    """Build a taxonomy lookup from the headers of a reference FASTA file.

    Every header is read as
    ``<id> Kingdom;Phylum;Class;Order;Family;Genus;Species``: the description
    is split on ';' and the Kingdom is taken as the second
    whitespace-separated token of the first field (the first token is the id).

    Parameters
    ----------
    base : path or handle passed to ``parse(base, 'fasta')``.

    Returns
    -------
    dict
        ``{record id: {rank: value}}`` for the seven ranks. A rank that is
        absent from the header is stored as 'NA'; Species is cut down to its
        first two words.
    """
    taxonomy_linage = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
    tax_decoder = {}

    for record in tqdm(parse(base, 'fasta')):

        fields = record.description.split(';')
        # A header that carries only the id has no kingdom token; treat it as
        # missing instead of failing with IndexError.
        head_tokens = fields[0].split()
        fields[0] = head_tokens[1] if len(head_tokens) > 1 else 'NA'

        ranks = {}
        for level, rank in enumerate(taxonomy_linage):
            if level >= len(fields):
                # Lineage is shorter than seven ranks.
                ranks[rank] = 'NA'
            elif rank == 'Species':
                ranks[rank] = ' '.join(fields[level].split()[:2])
            else:
                ranks[rank] = fields[level]

        tax_decoder[record.id] = ranks

    return tax_decoder

def parse_blast(path, 
                base, 
                data_tax, 
                consensus, 
                identity_filter, 
                cov_lim, 
                evalue_filter):
    """Pick the best BLAST hit per query and attach its taxonomy.

    Reads ``{path}/blast_results.txt`` (tab separated, '#' comment lines,
    columns qseqid sseqid pident evalue qcovs bitscore), keeps hits with
    ``pident >= identity_filter``, ``evalue <= evalue_filter`` and
    ``qcovs >= cov_lim``, and for every query takes the hit with the lowest
    e-value. Hits with equal e-values are ranked by bitscore (highest first),
    so the choice no longer depends on how the sort orders ties.

    Parameters
    ----------
    path : directory containing ``blast_results.txt``.
    base : not used by this function.
    data_tax : ``{subject id: {rank: value}}`` lookup.
    consensus : ``{query id: sequence}`` lookup.
    identity_filter, cov_lim, evalue_filter : thresholds described above.

    Returns
    -------
    DataFrame
        One row per query that kept at least one hit, indexed by the query's
        sequence, with the entries of the matching ``data_tax`` record as
        columns. Queries without a passing hit are omitted.
    """
    blast_header = ['qseqid',
                    'sseqid', 
                    'pident',
                    'evalue',
                    'qcovs', 
                    'bitscore']

    hits = read_csv(f'{path}/blast_results.txt', sep='\t', comment='#', header=None, names=blast_header)

    # Apply the three thresholds once to the whole table instead of
    # re-scanning it for every query.
    passing = hits[(hits['pident'] >= identity_filter)
                   & (hits['evalue'] <= evalue_filter)
                   & (hits['qcovs'] >= cov_lim)]

    blasting_results = {}
    # sort=False keeps the queries in the order they appear in the report;
    # groupby only yields queries that still have at least one hit.
    for query_id, query_hits in tqdm(passing.groupby('qseqid', sort=False)):
        ranked = query_hits.sort_values(by=['evalue', 'bitscore'], ascending=[True, False])
        best_subject = ranked['sseqid'].iloc[0]
        blasting_results[consensus[query_id]] = data_tax[best_subject]

    blasting_results_df = DataFrame(blasting_results).T

    return blasting_results_df
    
def processing_data_tax(data_tax):
    """Turn a ``{taxon: {sample: value}}`` mapping into an ordered table.

    Taxa become rows and samples become columns; missing cells are filled
    with 0. Rows are ordered by ascending row mean and columns are put in
    sorted order.
    """
    abundance = DataFrame(data_tax).T.fillna(0)

    # A temporary column 'm' carries the row mean only for the sort.
    with_mean = abundance.assign(m=abundance.mean(axis=1))
    by_mean = with_mean.sort_values('m').drop('m', axis=1)

    sorted_samples = np.sort(abundance.columns)
    return by_mean[sorted_samples]
    
def get_taxonomy(data_tax, 
                 blasting_results_df, 
                 mearged_pike_out, 
                 tax_level='OTU'):
    """Sum the per-sample values of all sequences that share a taxon label.

    Only sequences present in the index of both ``blasting_results_df`` and
    ``mearged_pike_out`` are used. With ``tax_level='OTU'`` every sequence
    gets its own label ``OTU_<n>_<Species>`` (n counts from 1 in the order of
    the shared sequences); otherwise the label is the sequence's value in
    column ``tax_level`` of ``blasting_results_df``.

    The incoming ``data_tax`` argument is not read; the name is rebound to a
    fresh accumulator.

    Returns
    -------
    (DataFrame, dict)
        The summed table after ``processing_data_tax`` and a decoder
        ``{'Seq': [...], 'OTU_name': [...]}`` listing each sequence with the
        label it received (recorded before the 'No Fungi' substitution).
    """
    data_tax = {}
    OTU_decoder = {'Seq': [], 'OTU_name': []}
    sample_names = mearged_pike_out.columns
    shared_seqs = np.intersect1d(blasting_results_df.index, mearged_pike_out.index)

    for number, seq in enumerate(tqdm(shared_seqs), start=1):

        if tax_level == 'OTU':
            label = f'OTU_{number}_{blasting_results_df["Species"][seq]}'
        else:
            label = blasting_results_df[tax_level][seq]

        OTU_decoder['Seq'].append(seq)
        OTU_decoder['OTU_name'].append(label)

        # NOTE(review): only the literal string 'nan' is renamed here; confirm
        # whether missing ranks ever arrive in that form.
        if label == 'nan':
            label = 'No Fungi'

        if label not in data_tax:
            data_tax[label] = dict.fromkeys(sample_names, 0)

        totals = data_tax[label]
        for sample in sample_names:
            totals[sample] += mearged_pike_out[sample][seq]

    return processing_data_tax(data_tax), OTU_decoder
    

def filter_data(output, 
                dbpath,
                mearged_pike_out,
                taxonomy_level, 
                identity_filter=0, 
                cov_lim=0, 
                evalue_filter=1e-10):
    """Run the whole annotation pipeline and return relative abundances.

    Steps: create ``output`` (a notice is printed if it already exists),
    write the sequences to FASTA, run BLAST against ``dbpath``, decode the
    database taxonomy, pick the best hit per sequence, aggregate the table by
    ``taxonomy_level`` and scale every sample column by its own sum.

    Returns
    -------
    tuple
        ``(relative abundance table, taxonomy decoder from decode_tax,
        best-hit table, OTU decoder as DataFrame)``. The abundance table has
        the column order of ``mearged_pike_out`` and rows ordered by
        ascending row mean; undefined ratios are stored as 0.
    """
    # Creating output directory
    try:
        os.mkdir(output)
    except FileExistsError:
        print('The output directory already exists!')

    consensus = create_fasta(output, mearged_pike_out, dbpath)
    run_blast(dbpath, output)
    data_tax = decode_tax(dbpath)
    blasting_results_df = parse_blast(output, dbpath, data_tax, consensus,
                                      identity_filter, cov_lim, evalue_filter)
    taxon_counts, OTU_decoder = get_taxonomy(data_tax, blasting_results_df,
                                             mearged_pike_out, taxonomy_level)

    sample_order = mearged_pike_out.columns

    # Divide each sample column by its total.
    relative = taxon_counts[sample_order].apply(lambda column: column / np.sum(column.values))
    relative = relative.fillna(0)[sample_order]

    # Order taxa by their mean share across samples (temporary column 'm').
    row_means = relative.mean(axis=1)
    data_tax_df = relative.assign(m=row_means).sort_values('m').drop('m', axis=1)

    return data_tax_df, data_tax, blasting_results_df, DataFrame(OTU_decoder)

In [3]:
import random
def get_color(obj_dict):
    """Return a random '#RRGGBB' colour that is not yet a value of ``obj_dict``.

    Parameters
    ----------
    obj_dict : mapping whose values are the colours already handed out.

    Returns
    -------
    str
        '#' followed by six upper-case hexadecimal digits.
    """
    color = ''

    # Keep drawing until the candidate is not among the colours in use.
    # (The previous condition stopped after the first draw, so a colour that
    # was already taken could be returned again.)
    while color == '' or color in obj_dict.values():

        color = "#"+''.join([random.choice('0123456789ABCDEF') for j in range(6)])

    return color

MERGING ALL TABLES INTO ONE

In [12]:
# Collect the 'Count' column of every sample for every k-mer size into one wide
# table (one column per '<sample><kmer_type>') and dump each sample as FASTA.
mearged_otu_table = []

for kmer_type in ['_k_3', '_k_4','_k_5', '_k_6','_k_7']:
#for kmer_type in ['_3_kmer_size', '_4_kmer_size', '_5_kmer_size', '_6_kmer_size', '_7_kmer_size']:    
    pool_dir = f'/mnt/AsusShareI2/RUNS/runs-sonec/pool_diff_kmers/ITS/pool_ITS_full{kmer_type}'
    for sample in listdir(f'{pool_dir}/results/'):
        opn_res = read_csv(f'{pool_dir}/results/{sample}/results.tsv', sep='\t', index_col=0)
        if 'Count' not in opn_res.columns:
            # Message (Russian): "Column "Count" is missing in the result for sample ...".
            # The label used to be built from an undefined name (amplicon_type),
            # which raised NameError instead of printing the notice.
            print(f'Столбец "Count" отсутствует в результате для образца {sample + kmer_type}')
            continue

        mearged_otu_table.append(opn_res['Count'].to_frame(name=sample + kmer_type))
        # FASTA header is '<running number>_<count>'; the record body is the index label.
        with open(f'{pool_dir}/{sample}.fasta', 'w') as opn_fasta:
            for count, (line, abundance) in enumerate(zip(opn_res.index, opn_res['Count'])):
                opn_fasta.write(f'>{count}_{abundance}\n{line}\n')

mearged_otu_table = pd.concat(mearged_otu_table, axis=1).fillna(0)
mearged_otu_table = mearged_otu_table.reindex(sorted(mearged_otu_table.columns), axis=1)
mearged_otu_table.to_csv('pool_ITS_12_kmers_3_7_merged_otu_table_all_samples_new_db.csv')

In [4]:
mearged_otu_table.head(10)

Unnamed: 0,ITS1_5.8_ITS2_1_k_3,ITS1_5.8_ITS2_1_k_4,ITS1_5.8_ITS2_1_k_5,ITS1_5.8_ITS2_1_k_6,ITS1_5.8_ITS2_1_k_7,ITS1_5.8_ITS2_2_k_3,ITS1_5.8_ITS2_2_k_4,ITS1_5.8_ITS2_2_k_5,ITS1_5.8_ITS2_2_k_6,ITS1_5.8_ITS2_2_k_7,...,ITS1_5.8_ITS2_R2_k_3,ITS1_5.8_ITS2_R2_k_4,ITS1_5.8_ITS2_R2_k_5,ITS1_5.8_ITS2_R2_k_6,ITS1_5.8_ITS2_R2_k_7,ITS1_5.8_ITS2_R3_k_3,ITS1_5.8_ITS2_R3_k_4,ITS1_5.8_ITS2_R3_k_5,ITS1_5.8_ITS2_R3_k_6,ITS1_5.8_ITS2_R3_k_7
TTAAGTTCAGCGGGTAATCTCGGATGAAGGAGGTGAAATGACGTAATAATTGAGAGTTTAAAAAAATCCATTTCAAGAAAGCAATGCGATCCCAGAGGGAACACGCTCCTCCATCCGTGCGGTACGGAATGCCATACCGCGCAATGTGCGTTCAAAGATTGATGATTCACATCTGCAAGTCACAAAAAATATCGCGGTTCGCTGCGTTCTTCATCGATGTGAGAGCCAAGAGATCCGTTGTTGATAGTTATAATTGAGATAAAATGACGCTGTAATAGAATAGAAATCCACAGAAATAGATAAAATCAATAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTACTTCCATCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTATTCCTACCTGATTTGAGGTCGAGCTTTTTGTTGTCTCGCAACACTCGCTCTCGGCCGCCAAGCGTCCCTGAAAAAAGTCTAGTTCGCTCGGCCAGCTTCGCTCCCTTTCAGGCGAGTCGCAGCTCCGACGCTCTTTACACGTCGTCCGCTCCGCTCCCCCAACTCTGCGCACGCGCAAGATGGAAACGACGCTCAAACAGGCATGCCCCCCGGAATGCCGAGGGGCGCAATGTGCGTTCAAGAACTCGATGATTCACGATGGCTGCAATTCACACTAGGTATCGCATTTCGCTGCGCTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTTTGTTTGTTTTTTCGTAGATTTCTCTTGTCGACTATATGCTATATTCCACATTTTAGGTGTTGTTGTTTTCGTTCCGCTCACGCAGTGTAGTAGTAAATCACAGTAATGATCCTTCCGCAGGTTCACCTACGGAAA,45.0,45.0,45.0,45.0,45.0,67.0,67.0,67.0,67.0,67.0,...,50.0,51.0,51.0,51.0,51.0,124.0,128.0,127.0,126.0,125.0
TTAAGTTCAGCGGGTATTCCTACCTGATTTGAGGTCAAACTTGTTTGGTTGTTGTAAGCCGCCAACAATAAGAAATATTGTCGCCACACCATTCAACGAGTTGGATAAACCTAAATACATTGAGATATAATATCTTCTACCATGCCAATATTTTTCAAGCAAACGCCTAGTCAAGAGTATCACTCAATACCAAACCCGAGGGTTTGAGAGAGAAATGACGCTCAAACAGGCATGCCCTTTGGAATACCAGAGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGAAAATCTGCAATTCATATTACTTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTTTGAAGATTAATTCAATAATTTGATTAACTGTAAAAATAATTAAATTTTGTTTGTTAAATCTCTGGCCCAATTTATATATAGGCCAAACCAAAGCAAAATTTTTCTATCAAAGAAGACACATGTGTAAGTTGATGCGCAGTTAAGCGTGAAATCTGTAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTACTTCCATCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTATTCCTACCTGATTTGAGGTCAAACTTGTATAGTTGTTGTAAGAGATACAACAATATTAGAAATATTTATACCATTCAACGAGTTGGAAAAACCTAATACATTGGAGGTAGACAGCACTATCTTGTACTACGCATGCTAATTTTTTTCAAGCAAACCTAGTCACCTAAGAGTATTACTCAACACCAAACCCGAGGGTTTGAGAGAGAAATGACGCTCAAACAGGCATGCCCTTTGGAATACCAGAGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGAATATCTGCAATTCATATTACTTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTTTGAAGATTAATTCAAAATCTGATTAACTGTAAAAATAATTAAAATGTGGTTTTGTTTAATCTCTGGCACAACCTATCTCTAAGCCAAACCAAAGCAAGTTTTTAAAACAAAAAAACACATGTGTAAAAAATTATGCAGTTAAGCGCTAATAATCTGTAATGATCCTTCCGCAGGTTCACCTACGGAAA,167.0,0.0,0.0,0.0,0.0,127.0,0.0,0.0,0.0,0.0,...,41.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0
TTAAGTTCAGCGGGTAATCTCGGATGAAGGAGGTGAAATGACGTAATAATTGAGAGTTTAAAAAATCCATTTCAAGAAAGCAATGCGATCCCAGAGGGAACACGCTCCTCCATCCGTGCGGTACGGAATGCCATACCGCGCAATGTGCGTTCAAAGATTGATGATTCACATCTGCAAGTCACAAAAAAATCGCGGTTCGCTGCGTTCTTCATCGATGTGAGAGCCAAGAGATCCGTTGTTGATAGTTATAATTGAGATAAAATGACGCTGTAATAGAATAGAAATCCACAGAAATAGATAAAATCAATAATGATCCTTCCGCAGGTTCACCTACGGAAA,5.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,13.0,0.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0
TTAAGTTCAGCGGGTAATCTCGGATGAAGGAGGTGAAATGACGTAATAATTGAGAGTTTAAAAAATCCATTTCAAGAAAGCAATGCGATCCCAGAGGGAACACGCTCCTCCATCCGTGCGGTACGGAATGCCATACCGCGCAATGTGCGTTCAAAGATTGATGATTCACATCTGCAAGTCACAAAAAATATCGCGGTTCGCTGCGTTCTTCATCGATGTGAGAGCCAAGAGATCCGTTGTTGATAGTTATAATTGAGATAAAATGACGCTGTAATAGAATAGAAATCCACAGAAATAGATAAAATCAATAATGATCCTTCCGCAGGTTCACCTACGGAAA,999.0,4189.0,9.0,3378.0,1016.0,1020.0,4147.0,12.0,3372.0,1023.0,...,1219.0,4901.0,13.0,3695.0,1196.0,2047.0,8289.0,18.0,6295.0,2144.0
TTAAGTTCAGCGGGTAATCTCGGATGAAGGAGGTGAAATGACGTAATAATTGAGAGTTTAAAAAAATCCATTTCAAGAAAGCAATGCGATCCCAGAGGGAACACGCTCCTCCATCCGTGCGGTACGGAATGCCATACCGCGCAATGTGCGTTCAAAGATTGATGATTCACATCTGCAAGTCACAAAAAATATCGCGGTTCGCTGCGTTCTTCATCGATGTGAGAGCCAAGAGATCCGTTGTTGATAGTTATAATTGAGATAAAATGACGCTGTAATAGAATAGAAATCCACAGAAATAGATAAAATCAATAATGATCCTTCCGCAGGTTCACCTACGGAAA,3288.0,450.0,4653.0,584.0,3245.0,3179.0,398.0,4571.0,511.0,3139.0,...,3357.0,343.0,5249.0,497.0,3525.0,5488.0,517.0,8768.0,782.0,5778.0
TTAAGTTCAGCGGGTATTCCTACCTGATTTGAGGTCGAGCTTTTTGTTGTCTCGCAACACTCGCTCTCGGCCGCCAAGCGTCCCTGAAAAAAGTCTAGTTCGCTCGGCCAGCTTCGCTCCCTTTCAGGCGAGTCGCAGCTCCGACGCTCTTTACACGTCGTCCGCTCCGCTCCCCCAACTCTGCGCACGCGCAAGATGGAAACGACGCTCAAACAGGCATGCCCCCCGGAATGCCGAGGGGCGCAATGTGCGTTCAAGAACTCGATGATTCACGATGGCTGCAATTCACACTAGGTATCGCATTTCGCTGCGCTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTTTGTTTGTTTTTTCGTAGATTTCTCTTGTCGACTATATGCTATATTCCACATTTTAGGTGTTGTTGTTTTCGTTCCGCTCACGCAGTGTAGTAGTAAATCACAGTAATGATCCTTCCGCAGGTTCACCTACGGAAA,2337.0,2343.0,0.0,1968.0,0.0,2881.0,2890.0,0.0,2410.0,0.0,...,2051.0,2057.0,0.0,1696.0,0.0,3448.0,3462.0,0.0,2881.0,0.0
TTAAGTTCAGCGGGTAGTCCTACCTGATTTGAGGTCAAAGTTTGAAGATATACGTGGTGGACGTTACCGCCGCAAACAATGTTTTTGGTTAGACCTAAGCCATTGTCAAAGCGATCCCGCCTTACCACTACCGTCTTTCAAGCAAACCCAAGTCGTATTGCTCAACACCAAACCCAGCGGTTTGAGGGAGAAACGACGCTCAAACAGGCATGCCCTCCGGAATACCAGAGGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGAATATCTGCAATTCATATTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTTTGACTATTAGTAATAATCTGGTGTGACAAGTTGATAAAAAATTGGTTGTAAGTTTAGACCTCTGGCGGCAGGCTGGGCCCACCGCCAAAGCAAGTTTGTTTCAAAGAAAAACACATGTGGTGCAATTAAGCAAATCAGTAATGATCCTTCCGCAGGTTCACCTACGGAAA,3126.0,3057.0,3028.0,3016.0,2917.0,2588.0,2532.0,2517.0,2512.0,2433.0,...,1031.0,987.0,990.0,985.0,958.0,1732.0,1677.0,1666.0,1654.0,1612.0
TTAAGTTCAGCGGGTAGTCCTACCTGATTTGAGGCGACAACAAAACGAAAAAAAAGCGTAGATTTTTTTCGTGCAAGCTGTAATTTTGTGAATGCAACGCCACCGCGAAGATTGGTGAGAAGACATCACGCTCAAACAGGCATGCCTTGGGGAATACCACAAGGCGCAATGTGCGTTCAAAGATTCGATGATTCACATCTGCAAGTCATACTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTTTATGTTTTTGTTTTAGTTGAACTTAACGTTTGGTTAGTTTTAAAATCCAAATCAGTGTGTATGCAAAATATCAATAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTACTTCCATCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTAGTCCTACCTGATTTGAGGTCAAAGTTATGAAATAAATTGTGGTGGCCACTAGCAAAATAAGCGTTTTTGGATAAACCTAAGTCGCTTAAAATAAGTTTCCACGTAAATTCTTTCAAACAAACCTAGCGTATTGCTCAACACCAAACCCGGGGGTTTGAGGGAGAAATGACGCTCAAACAGGCATGCCCTGTGGAATACCAAAAGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGAATATCTGCAATTCATATTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTTTGACTATTGTAATAATAAATCAAGTTTGACTGTAAATAAAAAGTTTGGTTTAGTTATAACCTCTGGCGGTAGGATTGCTCCCGCCACCAAAGAAATTTGTTCAATAAAAAACACATGTGGTGCAATTAAGCAAATCAGTAATGATCCTTCCGCAGGTTCACCTACGGAAA,91.0,0.0,0.0,0.0,0.0,83.0,0.0,0.0,0.0,0.0,...,26.0,0.0,0.0,0.0,0.0,43.0,0.0,0.0,0.0,0.0
TTAAGTTCAGCGGGTAATCTCGGATGAAGGAGGTGAAATGACGTAATAATTGAGAGTTTAAAAAAATCCATTTCAAGAAAGCAATGCGATCCCAGAGGGAACACGCTCCTCCATCCGTGCGGTACGGAATGCCATACCGCAATGTGCGTTCAAAGATTGATGATTCACATCTGCAAGTCACAAAAAATATCGCGGTTCGCTGCGTTCTTCATCGATGTGAGAGCCAAGAGATCCGTTGTTGATAGTTATAATTGAGATAAAATGACGCTGTAATAGAATAGAAATCCACAGAAATAGATAAAATCAATAATGATCCTTCCGCAGGTTCACCTACGGAAA,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0
TTAAGTTCAGCGGGTAATCTCGGATGAAGGAGGTGAAATGACGTAATAATTGAGAGTTTAAAAAAATTCATTTCAAGAAAGCAATGCGATCCCAGAGGGAACACGCTCCTCCATCCGTGCGGTACGGAATGCCATACCGCGCAATGTGCGTTCAAAGATTGATGATTCACATCTGCAAGTCACAAAAAATATCGCGGTTCGCTGCGTTCTTCATCGATGTGAGAGCCAAGAGATCCGTTGTTGATAGTTATAATTGAGATAAAATGACGCTGTAATAGAATAGAAATCCACAGAAATAGATAAAATCAATAATGATCCTTCCGCAGGTTCACCTACGGAAACCTTGTTACGACTTTTACTTCCATCCTCCGCTTATTGATATGCTTAAGTTCAGCGGGTAGTCCTACCTGATTTGAGGCGACAACAAAACGAAAAAAAAGCGTAGATTTTTTCGTGCAAGCTGTAATTTTGTGAATGCAACGCCACCGCGAAGATTGGTGAGAAGACATCACGCTCAAACAGGCATGCCTTGGGGAATACCCCAAGGCGCAATGTGCGTTCAAAGATTCGATGATTCACGTCTGCAAGTCATACTACGTATCGCATTTCGCTGCGTTCTTCATCGATGCGAGAACCAAGAGATCCGTTGTTGAAAGTTTTATGTTTTTGTTTTAGTTGAACTTAACGTTGGGTTAGTTTTAAAATCCAAATCAGTGTGTATGCAAAATATCAATAATGATCCTTCCGCAGGTTCACCTACGGAAA,96.0,0.0,0.0,0.0,0.0,96.0,0.0,0.0,0.0,0.0,...,97.0,0.0,0.0,0.0,0.0,185.0,0.0,0.0,0.0,0.0


In [22]:
opn_barcode_features = pd.read_csv('/mnt/AsusShareI2/RUNS/runs-sonec/pool_diff_kmers/ITS/pool_ITS_full_k_7/work_dir/features/tsv/ITS1_5.8_ITS2_1.tsv', sep='\t', index_col=0)

In [27]:
# Number of entries in the k-mer signature stored (as text) at row label 4003.
# NOTE(review): eval() executes whatever text the TSV cell contains. If the
# stored values are plain Python literals, ast.literal_eval would parse them
# without running code — confirm the format before switching.
len(eval(opn_barcode_features['K-mers signature'][4003]))

78125

In [21]:
len(opn_barcode_features['K-mers signature'][0])

392

**WORKING WITH UNITE AND BLAST**

**OTU**

In [15]:
# Annotate the merged table against the reference FASTA in `dbpath` and save
# the relative-abundance table.
# NOTE(review): `kmer_type` is not set in this cell; it is whatever value the
# merge loop left behind (its last element if that loop ran to completion), so
# the TAXONOMY folder of that one k-mer run receives results for all k-mer sizes.
output = f'/mnt/AsusShareI2/RUNS/runs-sonec/pool_diff_kmers/ITS/pool_ITS_full{kmer_type}/TAXONOMY'
#dbpath = '/mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta'
#dbpath = '/mnt/AsusSh/mnt/AsusShareI2/RUNS/runs-sonec/pool_diff_kmers/ITS/pool_ITS1areI2/RUNS/runs-sonec/dada_unite.fasta'
dbpath = '/mnt/AsusShareI2/RUNS/runs-sonec/unite_04.04.2024.fasta'
#taxonomy_level = 'Genus'
#taxonomy_level = 'Species'
taxonomy_level = 'OTU'

# With identity_filter=0 and cov_lim=0 only the e-value threshold removes hits.
data_tax_df, data_tax, blasting_results_df, OTU_decoder = filter_data(output, 
                                                                     dbpath,
                                                                     mearged_otu_table,
                                                                     taxonomy_level, 
                                                                     identity_filter=0, 
                                                                     cov_lim=0, 
                                                                     evalue_filter=1e-10)
data_tax_df.to_csv('pool_kmers_3_7_ITS_12_data_tax_df_OTU.csv', sep='\t')

The output directory already exists!


Building a new DB, current time: 08/22/2024 16:10:30
New DB name:   /mnt/AsusShareI2/RUNS/runs-sonec/unite_04.04.2024.fasta
New DB title:  /mnt/AsusShareI2/RUNS/runs-sonec/unite_04.04.2024.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /mnt/AsusShareI2/RUNS/runs-sonec/unite_04.04.2024.fasta
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 93085 sequences in 3.97439 seconds.




93085it [00:01, 89334.19it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 819.97it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 229/229 [00:00<00:00, 6444.81it/s]


In [None]:
data_tax_df.head(30)

In [None]:
# Per column of data_tax_df: how many rows hold a non-zero value,
# reshaped into a single-row table (one column per sample).
data_tax_no0_rows = data_tax_df.ne(0).sum(axis=0)
result = data_tax_no0_rows.to_frame().T
result.head(10)
#result.index = ['OTU_count_nonzero']
#result.to_csv('OTU_count_nonzero.csv',sep='\t')

In [None]:
result_long = result.melt(var_name='column', value_name='Species_count_nonzero')
result_long

In [None]:
# Column labels in this notebook are '<sample>_k_<size>' (e.g. 'ITS1_5.8_ITS2_R2_k_3');
# the pattern also accepts the older '<sample>_<N>_reads' form. The previous
# pattern only knew the '_reads' form, so every sample_name came out as NaN.
result_long['sample_name'] = result_long['column'].str.extract(
    r'(ITS1_5\.8_ITS2_R?\d+)_(?:k_)?\d+(?:_reads)?$', expand=False)
result_long
#result_long.to_csv('result_long_test.csv',sep='\t')

In [None]:
# Trailing number of the column label: the k-mer size for '<sample>_k_<size>'
# labels, the read depth for the older '<sample>_<N>_reads' labels.
result_long['number_of_reads'] = result_long['column'].str.extract(
    r'ITS1_5\.8_ITS2_R?\d+_(?:k_)?(\d+)(?:_reads)?$', expand=False)
result_long.drop('column', axis=1, inplace=True)
# fillna returns a new frame; keep it, otherwise the NaNs survive and the
# integer conversion below fails.
result_long = result_long.fillna(0)
result_long['number_of_reads'] = result_long['number_of_reads'].astype('int')
result_long

In [None]:
result_long=result_long.sort_values('number_of_reads',ascending=True)

In [None]:
result_long
result_long.to_csv('ITS_12_species_count_long_table.csv', sep='\t')

**PLOTTING FROM RESULT_LONG DF (OPTIMAL)**

In [None]:
# One line per sample: non-zero taxon count against the number extracted from
# the column label.
fig, ax = plt.subplots()
for sample in result_long['sample_name'].unique():
    subset = result_long[result_long['sample_name'] == sample]
    # The melt step names the value column 'Species_count_nonzero';
    # 'OTU_count_nonzero' does not exist in result_long and raised KeyError.
    ax.plot(subset['number_of_reads'], subset['Species_count_nonzero'], marker='o', linestyle='-', label=sample)

ax.set_xlabel('Number of reads')
ax.set_ylabel('Count')
ax.set_title('OTU')
ax.set_ylim(-1, 35)
ax.set_xscale('log')
#ax.xaxis.set_ticks(result_melted["number_of_reads"])
ax.legend()
plt.show()
#plt.savefig('OTU_V3_V4.png')

In [None]:
samples = pd.Series([col.rsplit('_', 2)[0] for col in result.columns]).drop_duplicates().tolist()
samples

In [None]:
# Trailing number of every column label (k-mer size in '<sample>_k_<size>',
# read depth in '<sample>_<N>_reads'), de-duplicated and sorted numerically.
# Taking the second-to-last '_' token returned 'k' for the '_k_<size>' labels,
# and sorting the values as strings put '100' before '50'.
reads = sorted(
    result.columns.str.extract(r'(\d+)(?:_reads)?$', expand=False).dropna().astype(int).unique().tolist()
)
reads

TESTING RESHAPING DF ON DUMMY DATA

In [None]:
# Toy wide table used to try out the reshaping: one column per
# '<region>_<replicate>_<depth>_reads' label, a single row of counts.
data = dict(
    zip(
        [
            'V3_V4_1_100_reads',
            'V3_V4_2_100_reads',
            'V3_V4_3_100_reads',
            'V3_V4_1_500_reads',
            'V3_V4_2_500_reads',
            'V3_V4_3_500_reads',
        ],
        ([10], [15], [20], [30], [30], [35]),
    )
)

df = pd.DataFrame(data)

# Wide -> long: one row per original column.
df_long = pd.melt(df, var_name='column', value_name='OTU_count_nonzero')

In [None]:
df

In [None]:
df_long

In [None]:
# Extract sample names and number of reads
df_long['sample_name'] = df_long['column'].str.extract(r'(V3_V4_\d+)_\d+_reads')
df_long

In [None]:
df_long['number_of_reads'] = df_long['column'].str.extract(r'V3_V4_\d+_(\d+)_reads')
df_long

In [None]:
# Drop the original column names as they are no longer needed
df_long.drop('column', axis=1, inplace=True)
df_long

In [None]:
# Pivot the table to get the desired format
df_pivot = df_long.pivot(index='number_of_reads', columns='sample_name', values='OTU_count_nonzero')
df_pivot

In [None]:
# Reset the index to get number_of_reads as a column
df_final = df_pivot.reset_index()
df_final

In [None]:
# Optionally, sort by number_of_reads if needed
df_final.columns

In [None]:
df_final.columns.name = None
# Print the final dataframe
print(df_final)

**GENUS**

PLOTS

In [None]:
# Give every row label of data_tax_df a random colour; get_color is handed the
# mapping built so far on each call.
Color_collection = {}

for taxon in data_tax_df.index:
    Color_collection[taxon] = get_color(Color_collection)

In [None]:
import plotly.express as px
import plotly.subplots as sp

# Stacked bars: one bar per sample (column of data_tax_df), one segment per taxon.
fig = px.bar(data_tax_df.T, 
             x=data_tax_df.columns, 
             y=data_tax_df.index,
             width=1500, 
             height=900, 
            # color=data_tax_df.index,
             labels={'value': 'Relative abundance', 'index':'Samples'}, 
             template='simple_white',
             color_discrete_map=Color_collection)
fig.update_layout(yaxis_range=[0, 1], legend_title_text='Taxon', legend_title_side='top center')
fig.update_traces(marker_line_width=1.1, marker_line_color='#202020', opacity=0.8)
fig.update_yaxes(ticksuffix = "  ")
fig.update_xaxes(range=[-1, len(mearged_otu_table.T)+0.2], autorangeoptions_clipmax=len(data_tax_df.T))

#fig.update_layout(showlegend=False)
# Make sure the export folder exists before writing (the mkdir call used to be
# commented out, so a fresh run had no VIZ directory to write into).
os.makedirs("VIZ", exist_ok=True)
fig.write_image(f"VIZ/16S_{taxonomy_level}.pdf")
fig.write_image(f"VIZ/16S_{taxonomy_level}.png", scale=5)
fig.show()