**V3_V4**

In [1]:
import numpy as np
import pandas as pd
import os
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns

from os import listdir
from pandas import read_csv, DataFrame
from tqdm import tqdm
from subprocess import call
from Bio.SeqIO import parse
from skbio.stats.composition import clr 


In [4]:
pip install -U kaleido

Collecting kaleido
  Using cached kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl.metadata (15 kB)
Using cached kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
Installing collected packages: kaleido
Successfully installed kaleido-0.2.1
Note: you may need to restart the kernel to use updated packages.


In [2]:
def create_fasta(output, mearged_pike_out, dbpath):
    """Dump the index of ``mearged_pike_out`` as a numbered FASTA file.

    Each index entry is written to ``{output}/all_consensus.fasta`` as one
    record whose header is its running number (0, 1, 2, ...) and whose body is
    the index entry itself.

    Parameters
    ----------
    output : str
        Directory in which ``all_consensus.fasta`` is (over)written.
    mearged_pike_out : DataFrame
        Table whose index entries are written out as the record bodies.
    dbpath : str
        Not used by this function.

    Returns
    -------
    dict
        Running number -> index entry, in index order.
    """
    id_to_sequence = dict(enumerate(mearged_pike_out.index))
    fasta_path = f'{output}/all_consensus.fasta'

    with open(fasta_path, 'w') as handle:
        for seq_id, sequence in id_to_sequence.items():
            handle.write(f'>{seq_id}\n{sequence}\n')

    return id_to_sequence
    
def run_blast(base, path, num_threads=60):
    """Build a nucleotide BLAST database from ``base`` and search the consensus FASTA against it.

    Parameters
    ----------
    base : str
        Path to the reference FASTA. ``makeblastdb`` is run on it on every call.
    path : str
        Directory that contains ``all_consensus.fasta``; ``blast_results.txt``
        is written into the same directory.
    num_threads : int, optional
        Value passed to ``blastn -num_threads``. Defaults to 60, the value that
        used to be hard-coded.

    Raises
    ------
    RuntimeError
        If ``makeblastdb`` or ``blastn`` exits with a non-zero status, so that a
        failed run cannot silently leave an old ``blast_results.txt`` in place.
    """
    commands = [
        ['makeblastdb', '-in', base, '-dbtype', 'nucl'],
        [
            'blastn',
            '-num_threads', str(num_threads),
            # tabular output with comment lines; column order must match parse_blast
            '-outfmt', '7 qseqid sseqid pident evalue qcovs bitscore',
            '-query', f'{path}/all_consensus.fasta',
            '-db', base,
            '-out', f'{path}/blast_results.txt',
        ],
    ]

    for command in commands:
        # Argument lists (no shell=True) keep paths with spaces or shell
        # metacharacters from being re-interpreted by a shell.
        exit_code = call(command)
        if exit_code != 0:
            raise RuntimeError(f'{command[0]} failed with exit code {exit_code}')
def decode_tax(base) -> dict:
    """Build a per-record taxonomy lookup from a FASTA file with lineage headers.

    The record description is expected to look like::

        <id> Kingdom;Phylum;Class;Order;Family;Genus;Species

    Parameters
    ----------
    base : str
        Path of the FASTA file (handed to ``Bio.SeqIO.parse`` with format ``'fasta'``).

    Returns
    -------
    dict
        ``{record id: {rank: name}}`` for the seven ranks Kingdom..Species.
        Ranks missing from a header are stored as ``'NA'``; the Species entry is
        cut down to its first two whitespace-separated words.
    """
    taxonomy_linage = ['Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
    tax_decoder = {}

    for record in tqdm(parse(base, 'fasta')):
        linage = record.description.split(';')

        # The first field is "<id> <Kingdom>": keep the second token. A header
        # that carries no lineage at all gets 'NA' instead of raising IndexError.
        head_tokens = linage[0].split()
        linage[0] = head_tokens[1] if len(head_tokens) > 1 else 'NA'

        ranks = {}
        for position, rank in enumerate(taxonomy_linage):
            if position >= len(linage):
                # lineage shorter than seven ranks (explicit check replaces a bare except)
                ranks[rank] = 'NA'
            elif rank == 'Species':
                ranks[rank] = ' '.join(linage[position].split()[:2])
            else:
                ranks[rank] = linage[position]

        tax_decoder[record.id] = ranks

    return tax_decoder

def parse_blast(path,
                base,
                data_tax,
                consensus,
                identity_filter,
                cov_lim,
                evalue_filter):
    """Pick the best BLAST hit per query and attach its taxonomy.

    Reads ``{path}/blast_results.txt`` (tab separated, ``#`` comment lines,
    columns qseqid/sseqid/pident/evalue/qcovs/bitscore), keeps hits with
    ``pident >= identity_filter``, ``evalue <= evalue_filter`` and
    ``qcovs >= cov_lim``, and selects one hit per query: highest identity
    first, then lowest e-value, then highest bit score.

    Parameters
    ----------
    path : str
        Directory holding ``blast_results.txt``.
    base : str
        Not used by this function.
    data_tax : dict
        Subject id -> taxonomy dict (as produced by ``decode_tax``).
    consensus : dict
        Query id -> query sequence (as produced by ``create_fasta``).
    identity_filter, cov_lim, evalue_filter : float
        Thresholds described above.

    Returns
    -------
    DataFrame
        One row per query sequence that has at least one surviving hit, with
        the taxonomy of its best hit as columns.
    """
    blast_header = ['qseqid',
                    'sseqid',
                    'pident',
                    'evalue',
                    'qcovs',
                    'bitscore']

    opn_blast = read_csv(f'{path}/blast_results.txt', sep='\t', comment='#', header=None, names=blast_header)

    # Filter the whole table once instead of re-filtering it for every query.
    passing = opn_blast[(opn_blast['pident'] >= identity_filter)
                        & (opn_blast['evalue'] <= evalue_filter)
                        & (opn_blast['qcovs'] >= cov_lim)]

    # One multi-key sort. The previous two consecutive single-key sorts followed
    # by a reversal did not keep the e-value order among equal-identity hits.
    ranked = passing.sort_values(by=['pident', 'evalue', 'bitscore'],
                                 ascending=[False, True, False])
    best_hits = ranked.drop_duplicates(subset='qseqid', keep='first')
    best_subject = dict(zip(best_hits['qseqid'], best_hits['sseqid']))

    blasting_results = {}
    # Iterate in order of first appearance so the row order of the result is unchanged.
    for query_id in tqdm(opn_blast['qseqid'].unique()):
        if query_id not in best_subject:
            continue
        blasting_results[consensus[query_id]] = data_tax[best_subject[query_id]]

    blasting_results_df = DataFrame(blasting_results).T

    return blasting_results_df
    
def processing_data_tax(data_tax):
    """Turn a ``{taxon: {sample: value}}`` mapping into an ordered table.

    Rows are taxa, columns are samples. Missing cells become 0, rows are
    ordered by ascending row mean and columns are ordered by sorted label.
    """
    table = DataFrame(data_tax).T.fillna(0)
    column_order = np.sort(table.columns)

    # temporary helper column 'm' carries the row mean used for ordering
    with_mean = table.assign(m=table.mean(axis=1))
    ranked = with_mean.sort_values('m').drop('m', axis=1)

    return ranked[column_order]
    
def get_taxonomy(data_tax,
                 blasting_results_df,
                 mearged_pike_out,
                 tax_level='OTU'):
    """Aggregate per-sequence counts into per-taxon counts.

    Only sequences present in both ``blasting_results_df`` and
    ``mearged_pike_out`` are used. With ``tax_level='OTU'`` every sequence gets
    its own label ``OTU_<n>_<Species>``; otherwise the label is the value of the
    ``tax_level`` column, and sequences sharing a label are summed per sample.

    Parameters
    ----------
    data_tax
        Ignored: the name is rebound to a fresh dict on the first line.
    blasting_results_df : DataFrame
        Taxonomy per sequence (index: sequence, columns: ranks).
    mearged_pike_out : DataFrame
        Values per sequence (index) and sample (columns).
    tax_level : str
        ``'OTU'`` or a column name of ``blasting_results_df``.

    Returns
    -------
    (DataFrame, dict)
        The table produced by ``processing_data_tax`` and a decoder with the
        keys ``'Seq'`` and ``'OTU_name'``.
    """
    data_tax = {}
    shared_sequences = np.intersect1d(blasting_results_df.index, mearged_pike_out.index)
    OTU_decoder = {'Seq': [], 'OTU_name': []}
    sample_columns = mearged_pike_out.columns

    for count, av in enumerate(tqdm(shared_sequences), start=1):

        if tax_level == 'OTU':
            tax = f'OTU_{count}_{blasting_results_df["Species"][av]}'
        else:
            tax = blasting_results_df[tax_level][av]

        # the decoder stores the label as it is *before* the 'nan' replacement below
        OTU_decoder['Seq'].append(av)
        OTU_decoder['OTU_name'].append(tax)

        # NOTE(review): only the literal string 'nan' is replaced; decode_tax
        # marks missing ranks as 'NA' - confirm this branch is still needed.
        if tax == 'nan':
            tax = 'No Fungi'

        per_sample = data_tax.setdefault(tax, {col: 0 for col in sample_columns})
        for col in sample_columns:
            per_sample[col] += mearged_pike_out[col][av]

    data_tax_df = processing_data_tax(data_tax)

    return data_tax_df, OTU_decoder
    

def filter_data(output,
                dbpath,
                mearged_pike_out,
                taxonomy_level,
                identity_filter=95,
                cov_lim=60,
                evalue_filter=1e-05):
    """Run the full annotation pipeline and return relative abundances.

    Steps: write the sequences to FASTA, BLAST them against ``dbpath``
    (the database is rebuilt by ``run_blast`` on every call), decode the
    reference taxonomy, keep the best hit per sequence, aggregate counts at
    ``taxonomy_level`` and convert every sample column to fractions of its sum.

    Parameters
    ----------
    output : str
        Working directory for the FASTA and BLAST files; created if missing.
    dbpath : str
        Reference FASTA with taxonomy in the headers.
    mearged_pike_out : DataFrame
        Count table (index: sequences, columns: samples).
    taxonomy_level : str
        ``'OTU'`` or a rank name such as ``'Genus'`` / ``'Species'``.
    identity_filter, cov_lim, evalue_filter : float
        BLAST hit thresholds forwarded to ``parse_blast``.

    Returns
    -------
    tuple
        ``(data_tax_df, data_tax, blasting_results_df, OTU_decoder_df)`` where
        ``data_tax_df`` holds relative abundances with rows ordered by ascending
        mean, ``data_tax`` is the reference taxonomy lookup from ``decode_tax``
        and ``OTU_decoder_df`` is the sequence -> label table.
    """
    if os.path.isdir(output):
        print('The output directory already exists!')
    else:
        # makedirs instead of mkdir: a missing parent directory no longer aborts the run
        os.makedirs(output)

    consensus = create_fasta(output, mearged_pike_out, dbpath)
    run_blast(dbpath, output)
    data_tax = decode_tax(dbpath)
    blasting_results_df = parse_blast(output,
                                      dbpath,
                                      data_tax,
                                      consensus,
                                      identity_filter,
                                      cov_lim,
                                      evalue_filter)

    data_tax_df, OTU_decoder = get_taxonomy(data_tax,
                                            blasting_results_df,
                                            mearged_pike_out,
                                            taxonomy_level)

    # Normalise every sample column to sum to 1 in a single vectorised step.
    # The previous column-by-column assignment wrote into a column-subset frame,
    # which pandas flags as chained assignment.
    counts = data_tax_df[mearged_pike_out.columns]
    data_tax_df = counts.div(counts.sum(axis=0), axis=1)

    # A sample whose total is 0 yields NaN (0/0); report such cells as 0.
    data_tax_df = data_tax_df.fillna(0)[mearged_pike_out.columns]
    data_tax_df = data_tax_df.assign(m=data_tax_df.mean(axis=1)).sort_values('m').drop('m', axis=1)

    return data_tax_df, data_tax, blasting_results_df, DataFrame(OTU_decoder)

In [6]:
import random
def get_color(obj_dict):
    """Return a random ``#RRGGBB`` colour that is not yet a value of ``obj_dict``.

    Parameters
    ----------
    obj_dict : dict
        Mapping whose values are the colours already handed out.

    Returns
    -------
    str
        Upper-case hex colour such as ``'#3FA2C0'``.
    """
    while True:
        color = "#" + ''.join(random.choice('0123456789ABCDEF') for _ in range(6))
        # Redraw on a collision. The previous loop condition only ran while the
        # colour was still empty, so duplicates were never rejected.
        if color not in obj_dict.values():
            return color

MERGING ALL TABLES INTO ONE

In [4]:
# Folder that holds one "pike_V3_V4_trimmed<depth>" run directory per subsampling depth.
PIKE_ROOT = '/mnt/AsusShareI2/RUNS/runs-sonec/pike_single_mode/pike_V3_V4_all'

# NOTE(review): tables shown further down contain "..._100_reads" columns, but
# '_100_reads' is not listed here - confirm whether that depth should be included.
AMPLICON_TYPES = ['_100000_reads', '_10000_reads', '_1000_reads', '_150000_reads', '_150_reads',
                  '_200_reads', '_250_reads', '_300000_reads', '_30000_reads', '_3000_reads', '_300_reads',
                  '_400000_reads', '_500000_reads', '_50000_reads', '_5000_reads', '_500_reads',
                  '_600000_reads', '_700000_reads']

otu_columns = []

# `amplicon_type` stays a plain module-level loop variable on purpose: later
# cells read the value it holds after the loop to build their TAXONOMY path.
for amplicon_type in AMPLICON_TYPES:
    run_dir = f'{PIKE_ROOT}/pike_V3_V4_trimmed{amplicon_type}'

    for sample in listdir(f'{run_dir}/results/'):
        opn_res = read_csv(f'{run_dir}/results/{sample}/results.tsv', sep='\t', index_col=0)

        if 'Count' not in opn_res.columns:
            print(f'Столбец "Count" отсутствует в результате для образца {sample + amplicon_type}')
            continue

        # one single-column frame per sample/depth; they are joined on the index below
        otu_columns.append(DataFrame(data=opn_res['Count'].tolist(), index=opn_res.index, columns=[sample + amplicon_type]))

        # Per-sample FASTA with headers "<running number>_<count>". Index and
        # counts are paired positionally, so duplicate index entries cannot turn
        # a label lookup into a whole Series inside the header.
        with open(f'{run_dir}/{sample}.fasta', 'w') as opn_fasta:
            for count, (sequence, n_reads) in enumerate(zip(opn_res.index, opn_res['Count'])):
                opn_fasta.write(f'>{count}_{n_reads}\n{sequence}\n')

# outer join on the index; entries absent from a sample count as 0
mearged_otu_table = pd.concat(otu_columns, axis=1).fillna(0)
mearged_otu_table = mearged_otu_table.reindex(sorted(mearged_otu_table.columns), axis=1)
mearged_otu_table.to_csv('single_V3_V4_merged_otu_table_all_samples_all_reads.csv')

In [None]:
# Preview the merged table: one row per index entry of the per-sample results,
# one column per "<sample><depth suffix>" combination.
mearged_otu_table.head(10)

**WORKING WITH SILVA AND BLAST**

**OTU**

In [7]:
# OTU-level run of the annotation pipeline.
# NOTE(review): `amplicon_type` is not assigned in this cell; the path uses
# whatever value the merging loop left behind - confirm that is the intended folder.
output = f'/mnt/AsusShareI2/RUNS/runs-sonec/pike_single_mode/pike_V3_V4_all/pike_V3_V4_trimmed{amplicon_type}/TAXONOMY'
dbpath = '/mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta'
taxonomy_level = 'OTU'  # the sections below repeat this with 'Genus' and 'Species'

data_tax_df, data_tax, blasting_results_df, OTU_decoder = filter_data(
    output=output,
    dbpath=dbpath,
    mearged_pike_out=mearged_otu_table,
    taxonomy_level=taxonomy_level,
    identity_filter=95,
    cov_lim=60,
    evalue_filter=1e-05,
)
data_tax_df.to_csv('single_V3_V4_data_tax_df_OTU.csv', sep='\t')

The output directory already exists!


Building a new DB, current time: 08/06/2024 15:54:33
New DB name:   /mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta
New DB title:  /mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 510508 sequences in 11.6544 seconds.




510508it [00:05, 90993.34it/s]
100%|████████████████████████████████████████| 330/330 [00:00<00:00, 979.02it/s]
100%|███████████████████████████████████████| 315/315 [00:00<00:00, 2432.27it/s]


In [None]:
# filter_data returns rows ordered by ascending mean relative abundance, so
# these are the 30 rows with the lowest mean.
data_tax_df.head(30)

In [9]:
# Per sample column: number of rows whose relative abundance is not zero.
data_tax_no0_rows = data_tax_df.ne(0).sum()
# one-row wide table (row label 0), one column per sample
result = data_tax_no0_rows.to_frame().T
result.head(10)

Unnamed: 0,V3_V4_1_100000_reads,V3_V4_1_10000_reads,V3_V4_1_1000_reads,V3_V4_1_100_reads,V3_V4_1_150000_reads,V3_V4_1_150_reads,V3_V4_1_200_reads,V3_V4_1_250_reads,V3_V4_1_300000_reads,V3_V4_1_30000_reads,...,V3_V4_R3_30000_reads,V3_V4_R3_3000_reads,V3_V4_R3_300_reads,V3_V4_R3_400000_reads,V3_V4_R3_500000_reads,V3_V4_R3_50000_reads,V3_V4_R3_5000_reads,V3_V4_R3_500_reads,V3_V4_R3_600000_reads,V3_V4_R3_700000_reads
0,11,11,11,1,10,2,4,8,11,12,...,17,8,8,24,25,22,8,8,23,25


In [10]:
# Wide -> long: one row per sample column with its non-zero count.
result_long = pd.melt(result, var_name='column', value_name='OTU_count_nonzero')
result_long

Unnamed: 0,column,OTU_count_nonzero
0,V3_V4_1_100000_reads,11
1,V3_V4_1_10000_reads,11
2,V3_V4_1_1000_reads,11
3,V3_V4_1_100_reads,1
4,V3_V4_1_150000_reads,10
...,...,...
112,V3_V4_R3_50000_reads,22
113,V3_V4_R3_5000_reads,8
114,V3_V4_R3_500_reads,8
115,V3_V4_R3_600000_reads,23


In [11]:
# Sample part of the column label, e.g. "V3_V4_R3" from "V3_V4_R3_5000_reads".
column_labels = result_long['column']
result_long['sample_name'] = column_labels.str.extract(r'(V3_V4_(?:R)?\d+)_\d+_reads', expand=False)
result_long

Unnamed: 0,column,OTU_count_nonzero,sample_name
0,V3_V4_1_100000_reads,11,V3_V4_1
1,V3_V4_1_10000_reads,11,V3_V4_1
2,V3_V4_1_1000_reads,11,V3_V4_1
3,V3_V4_1_100_reads,1,V3_V4_1
4,V3_V4_1_150000_reads,10,V3_V4_1
...,...,...,...
112,V3_V4_R3_50000_reads,22,V3_V4_R3
113,V3_V4_R3_5000_reads,8,V3_V4_R3
114,V3_V4_R3_500_reads,8,V3_V4_R3
115,V3_V4_R3_600000_reads,23,V3_V4_R3


In [12]:
# Read depth encoded in the column label, e.g. 5000 from "V3_V4_R3_5000_reads".
read_counts = result_long['column'].str.extract(r'V3_V4_(?:R)?\d+_(\d+)_reads', expand=False)
# Labels that do not match the pattern get depth 0. The previous bare
# `result_long.fillna(0)` discarded its result, so such a label would have made
# the integer conversion fail.
result_long['number_of_reads'] = read_counts.fillna('0').astype('int')
result_long = result_long.drop(columns='column')
result_long

Unnamed: 0,OTU_count_nonzero,sample_name,number_of_reads
0,11,V3_V4_1,100000
1,11,V3_V4_1,10000
2,11,V3_V4_1,1000
3,1,V3_V4_1,100
4,10,V3_V4_1,150000
...,...,...,...
112,22,V3_V4_R3,50000
113,8,V3_V4_R3,5000
114,8,V3_V4_R3,500
115,23,V3_V4_R3,600000


In [13]:
# order the rows by increasing read depth so the curves are drawn left to right
result_long = result_long.sort_values(by='number_of_reads')

In [15]:
# persist the long OTU-count table (tab separated)
result_long.to_csv('V3_V4_OTU_count_long_table.csv', sep='\t')

**PLOTTING FROM RESULT_LONG DF (OPTIMAL)**

In [None]:
# Number of non-zero OTUs versus read depth, one line per sample.
fig, ax = plt.subplots()
sample_column = result_long['sample_name']
for sample in sample_column.unique():
    curve = result_long[sample_column == sample]
    ax.plot(
        curve['number_of_reads'],
        curve['OTU_count_nonzero'],
        marker='o',
        linestyle='-',
        label=sample,
    )

ax.set(xlabel='Number of reads', ylabel='Count', title='OTU')
ax.set_ylim(-1, 35)
# read depths span several orders of magnitude
ax.set_xscale('log')
ax.legend()
plt.show()

In [None]:
# Distinct sample names in order of first appearance: the column label with its
# last two "_" separated parts ("<depth>_reads") removed.
samples = list(dict.fromkeys(col.rsplit('_', 2)[0] for col in result.columns))
samples

In [None]:
# Distinct read depths: the second-to-last "_" separated part of each column label.
reads = pd.Series([col.split('_')[-2] for col in result.columns]).drop_duplicates().tolist()
# The depths are digit strings; sort them by numeric value. A plain string sort
# would put '10000' before '150'.
reads = sorted(reads, key=int)
reads

TESTING RESHAPING DF ON DUMMY DATA

In [None]:
# Small hand-made wide table (one row, one column per "<sample>_<depth>_reads")
# used to try out the reshaping steps below.
data = {
    'V3_V4_1_100_reads': [10],
    'V3_V4_2_100_reads': [15],
    'V3_V4_3_100_reads': [20],
    'V3_V4_1_500_reads': [30],
    'V3_V4_2_500_reads': [30],
    'V3_V4_3_500_reads': [35]
}
df = pd.DataFrame(data)

# wide -> long: one row per original column
df_long = pd.melt(df, var_name='column', value_name='OTU_count_nonzero')

In [None]:
# the dummy table in wide form
df

In [None]:
# the dummy table after melting to long form
df_long

In [None]:
# Sample name = the "V3_V4_<n>" prefix of the column label.
dummy_labels = df_long['column']
df_long['sample_name'] = dummy_labels.str.extract(r'(V3_V4_\d+)_\d+_reads', expand=False)
df_long

In [None]:
# Read depth = the digits between the sample name and "_reads" (kept as strings here).
dummy_labels = df_long['column']
df_long['number_of_reads'] = dummy_labels.str.extract(r'V3_V4_\d+_(\d+)_reads', expand=False)
df_long

In [None]:
# The raw column labels are fully decoded into sample_name / number_of_reads,
# so the 'column' field can go.
df_long = df_long.drop(columns='column')
df_long

In [None]:
# Long -> wide again: one row per read depth, one column per sample.
df_pivot = pd.pivot(df_long, index='number_of_reads', columns='sample_name', values='OTU_count_nonzero')
df_pivot

In [None]:
# Move number_of_reads from the index back into an ordinary column.
df_final = df_pivot.reset_index()
df_final

In [None]:
# Inspect the column labels of the pivoted table (no sorting happens here).
df_final.columns

In [None]:
# Remove the "sample_name" caption that pivot left on the column axis, then
# print the table as plain text.
df_final = df_final.rename_axis(columns=None)
print(df_final)

**GENUS**

In [16]:
# Genus-level run of the annotation pipeline.
# NOTE(review): this path points into ".../pike_all/...", while the merged table
# and the OTU run use ".../pike_single_mode/..." - confirm which folder is intended.
# NOTE(review): `amplicon_type` is the value left over from the merging loop.
output = f'/mnt/AsusShareI2/RUNS/runs-sonec/pike_all/pike_V3_V4_all/pike_V3_V4_trimmed{amplicon_type}/TAXONOMY'
dbpath = '/mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta'
taxonomy_level = 'Genus'

data_tax_df, data_tax, blasting_results_df, OTU_decoder = filter_data(
    output=output,
    dbpath=dbpath,
    mearged_pike_out=mearged_otu_table,
    taxonomy_level=taxonomy_level,
    identity_filter=95,
    cov_lim=60,
    evalue_filter=1e-05,
)
data_tax_df.to_csv('V3_V4_data_tax_df_genus.csv', sep='\t')

The output directory already exists!


Building a new DB, current time: 05/16/2024 13:17:15
New DB name:   /mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta
New DB title:  /mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 510508 sequences in 12.177 seconds.




510508it [00:07, 71395.23it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 340/340 [00:00<00:00, 847.51it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 325/325 [00:00<00:00, 1968.58it/s]


In [17]:
# Per sample column: number of genera with a non-zero relative abundance,
# stored as a one-row table labelled 'Genus_count_nonzero'.
data_tax_no0_rows = data_tax_df.ne(0).sum()
result = data_tax_no0_rows.to_frame(name='Genus_count_nonzero').T

In [18]:
# Wide -> long, then decode "<sample>_<depth>_reads" column labels.
result_long = result.melt(var_name='column', value_name='Genus_count_nonzero')
column_labels = result_long['column']
result_long['sample_name'] = column_labels.str.extract(r'(V3_V4_(?:R)?\d+)_\d+_reads', expand=False)
read_counts = column_labels.str.extract(r'V3_V4_(?:R)?\d+_(\d+)_reads', expand=False)
# Labels that do not match get depth 0. The previous bare `result_long.fillna(0)`
# discarded its result, so such a label would have made the int conversion fail.
result_long['number_of_reads'] = read_counts.fillna('0').astype('int')
result_long = result_long.drop(columns='column').sort_values('number_of_reads', ascending=True)
result_long.to_csv('V3_V4_genus_count_long_table.csv', sep='\t')

In [None]:
# Number of non-zero genera versus read depth, one line per sample.
fig, ax = plt.subplots()
sample_column = result_long['sample_name']
for sample in sample_column.unique():
    curve = result_long[sample_column == sample]
    ax.plot(
        curve['number_of_reads'],
        curve['Genus_count_nonzero'],
        marker='o',
        linestyle='-',
        label=sample,
    )

ax.set(xlabel='Number of reads', ylabel='Count', title='GENUS')
ax.set_ylim(-1, 35)
# read depths span several orders of magnitude
ax.set_xscale('log')
ax.legend()
plt.show()

**SPECIES**

In [19]:
# Species-level run of the annotation pipeline.
# NOTE(review): this path points into ".../pike_all/...", while the merged table
# and the OTU run use ".../pike_single_mode/..." - confirm which folder is intended.
# NOTE(review): `amplicon_type` is the value left over from the merging loop.
output = f'/mnt/AsusShareI2/RUNS/runs-sonec/pike_all/pike_V3_V4_all/pike_V3_V4_trimmed{amplicon_type}/TAXONOMY'
dbpath = '/mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta'
taxonomy_level = 'Species'

data_tax_df, data_tax, blasting_results_df, OTU_decoder = filter_data(
    output=output,
    dbpath=dbpath,
    mearged_pike_out=mearged_otu_table,
    taxonomy_level=taxonomy_level,
    identity_filter=95,
    cov_lim=60,
    evalue_filter=1e-05,
)
data_tax_df.to_csv('V3_V4_data_tax_df_species.csv', sep='\t')

The output directory already exists!


Building a new DB, current time: 05/16/2024 13:20:24
New DB name:   /mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta
New DB title:  /mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /mnt/AsusShareI2/RUNS/runs-sonec/SILVA_138.1_SSURef_NR99_tax_silva.fasta
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 510508 sequences in 11.8562 seconds.




510508it [00:07, 71995.63it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 340/340 [00:00<00:00, 846.61it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 325/325 [00:00<00:00, 1959.94it/s]


In [20]:
# Per sample column: number of species with a non-zero relative abundance,
# stored as a one-row table labelled 'Species_count_nonzero'.
data_tax_no0_rows = data_tax_df.ne(0).sum()
result = data_tax_no0_rows.to_frame(name='Species_count_nonzero').T

In [21]:
# Wide -> long, then decode "<sample>_<depth>_reads" column labels.
result_long = result.melt(var_name='column', value_name='Species_count_nonzero')
column_labels = result_long['column']
result_long['sample_name'] = column_labels.str.extract(r'(V3_V4_(?:R)?\d+)_\d+_reads', expand=False)
read_counts = column_labels.str.extract(r'V3_V4_(?:R)?\d+_(\d+)_reads', expand=False)
# Labels that do not match get depth 0. The previous bare `result_long.fillna(0)`
# discarded its result, so such a label would have made the int conversion fail.
result_long['number_of_reads'] = read_counts.fillna('0').astype('int')
result_long = result_long.drop(columns='column').sort_values('number_of_reads', ascending=True)
result_long.to_csv('V3_V4_species_count_long_table.csv', sep='\t')

In [None]:
# Number of non-zero species versus read depth, one line per sample.
fig, ax = plt.subplots()
for sample in result_long['sample_name'].unique():
    subset = result_long[result_long['sample_name'] == sample]
    ax.plot(subset['number_of_reads'], subset['Species_count_nonzero'], marker='o', linestyle='-', label=sample)

ax.set_xlabel('Number of reads')
ax.set_ylabel('Count')
ax.set_title('Species')
ax.set_ylim(-1, 35)
# read depths span several orders of magnitude
ax.set_xscale('log')
ax.legend()
# Save through the figure object and before plt.show(): once the figure has been
# shown, pyplot's current figure is a new empty one and the PNG would be blank.
fig.savefig('Species_V3_V4.png')
plt.show()

PLOTS

In [None]:
# One random colour per row label of data_tax_df. The dict is filled
# incrementally because get_color receives the colours handed out so far.
Color_collection = {}
for taxon in data_tax_df.index:
    Color_collection[taxon] = get_color(Color_collection)

In [None]:
import plotly.express as px
import plotly.subplots as sp

# Stacked bar chart: one bar per sample, one coloured segment per taxon.
fig = px.bar(data_tax_df.T,
             x=data_tax_df.columns,
             y=data_tax_df.index,
             width=1500,
             height=900,
             labels={'value': 'Relative abundance', 'index':'Samples'},
             template='simple_white',
             color_discrete_map=Color_collection)
fig.update_layout(yaxis_range=[0, 1], legend_title_text='Taxon', legend_title_side='top center')
fig.update_traces(marker_line_width=1.1, marker_line_color='#202020', opacity=0.8)
fig.update_yaxes(ticksuffix = "  ")
fig.update_xaxes(range=[-1, len(mearged_otu_table.T)+0.2], autorangeoptions_clipmax=len(data_tax_df.T))

# write_image does not create missing folders, so make sure VIZ/ exists
os.makedirs("VIZ", exist_ok=True)
fig.write_image(f"VIZ/16S_{taxonomy_level}.pdf")
fig.write_image(f"VIZ/16S_{taxonomy_level}.png", scale=5)
fig.show()