In [1]:
### SETTING UP THE DATAFRAME ###

In [295]:
#Importing relevant modules: 
# * dataframe - pandas 
# * graphs - matplotlib 
# * tools - numpy, random, collections, itertools, os
# * fasta to dataframe conversion - SeqIO

import collections
import collections.abc
import os
import random
from itertools import product

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from Bio import SeqIO

In [3]:
#Top-level work directory; every other path in this notebook is built from it.
#Set the WORK_DIR environment variable before starting the kernel to point the
#notebook at another location; the original path is kept as the default.
#os.path.join(..., '') guarantees the trailing separator that the string
#concatenations in this notebook (e.g. work_dir + 'data/') rely on.
work_dir = os.path.join(os.environ.get('WORK_DIR', '/home/gamran/dirtreetest/'), '')

In [4]:
#Functions that generate main work directories and subdirectories for:
# * data (all fasta files)
# * blast (all blast output files)
# * analysis (all dataframe summary and graph files)
# When given a top directory path

def create_main_directories(top_dir):
    """Create the three main subdirectories (data, blast, analysis) under a top work directory.

    Args:
        top_dir: Path of the top work directory, with or without a trailing slash.

    Directories that already exist are kept, so the call can safely be repeated.
    """
    for main_directory in ('data', 'blast', 'analysis'):
        #os.path.join supplies the separator, so top_dir does not have to end in '/'
        os.makedirs(os.path.join(top_dir, main_directory), exist_ok=True)

def create_data_subdirectories(top_dir):
    """Create the subdirectories of data/ and the two subdirectories of data/rghitno/.

    Args:
        top_dir: Path of the top work directory, with or without a trailing slash.

    Creates data/basecalled, data/rghityes, data/rghitno and data/rghitsamples,
    then data/rghitno/ncbihityes and data/rghitno/ncbihitno. Missing parents are
    created as needed; directories that already exist are kept.
    """
    data_dir = os.path.join(top_dir, 'data')
    for data_subdirectory in ('basecalled', 'rghityes', 'rghitno', 'rghitsamples'):
        os.makedirs(os.path.join(data_dir, data_subdirectory), exist_ok=True)
    for rghitno_subdirectory in ('ncbihityes', 'ncbihitno'):
        os.makedirs(os.path.join(data_dir, 'rghitno', rghitno_subdirectory), exist_ok=True)

def create_blast_subdirectories(top_dir):
    """Create blast/rgbesthit and blast/ncbibesthit under a top work directory.

    Args:
        top_dir: Path of the top work directory, with or without a trailing slash.

    Missing parents are created as needed; directories that already exist are kept.
    """
    for blast_subdirectory in ('rgbesthit', 'ncbibesthit'):
        os.makedirs(os.path.join(top_dir, 'blast', blast_subdirectory), exist_ok=True)

def create_analysis_subdirectories(top_dir):
    """Create analysis/graphs and analysis/summaries under a top work directory.

    Args:
        top_dir: Path of the top work directory, with or without a trailing slash.

    Missing parents are created as needed; directories that already exist are kept.
    """
    for analysis_subdirectory in ('graphs', 'summaries'):
        os.makedirs(os.path.join(top_dir, 'analysis', analysis_subdirectory), exist_ok=True)

In [None]:
#TODO: create a function that, given an input location, moves all basecalled fasta files into data/basecalled/

In [5]:
#Build the whole directory tree under work_dir: main directories first, then
#the data, blast and analysis subdirectories (same order as the definitions)
for build_directories in (create_main_directories,
                          create_data_subdirectories,
                          create_blast_subdirectories,
                          create_analysis_subdirectories):
    build_directories(work_dir)

In [89]:
#Walks data/basecalled bottom-up and keeps every (dirpath, dirnames, filenames)
#entry that os.walk yields; the nested result is flattened in a later cell
os.chdir(work_dir + 'data/')

basecalled_file_names = list(os.walk('./basecalled', topdown=False))

In [88]:
#Reduces basecalled_file_names to a flattened list, extracts all basecalled fasta file names
def flatten(l):
    """Lazily yield the leaf elements of an arbitrarily nested iterable.

    Args:
        l: An iterable whose elements may themselves be iterables (lists,
           tuples, generators, ...), nested to any depth.

    Yields:
        Every non-iterable element in depth-first, left-to-right order.
        str and bytes values are yielded whole rather than split into characters.
    """
    for el in l:
        #collections.Iterable was removed in Python 3.10; the ABC lives in collections.abc
        if isinstance(el, collections.abc.Iterable) and not isinstance(el, (str, bytes)):
            yield from flatten(el)
        else:
            yield el

#Flatten the os.walk result into one list of strings (directory paths, directory names, file names)
basecalled_file_names_flattened = list(flatten(basecalled_file_names))

#Keep only names that END in '.fa'. A plain substring test would also accept
#e.g. '.fasta' or '.fa.fai', and the fixed-position slicing used further down
#to pull out the barcode and run ID assumes the name ends in 'XXXX.fa'.
basecalled_fasta_file_names = sorted(x for x in basecalled_file_names_flattened if x.endswith('.fa'))


['1212_1D_barcoding_Wagga_BC01.fa',
 '1212_1D_barcoding_Wagga_BC02.fa',
 '1212_1D_barcoding_Wagga_BC03.fa',
 '1212_1D_barcoding_Wagga_BC04.fa',
 '1212_1D_barcoding_Wagga_BC05.fa',
 '1212_1D_barcoding_Wagga_BC06.fa',
 '1212_1D_barcoding_Wagga_BC07.fa',
 '1212_1D_barcoding_Wagga_BC08.fa',
 '1212_1D_barcoding_Wagga_BC09.fa',
 '1212_1D_barcoding_Wagga_BC10.fa',
 '1212_1D_barcoding_Wagga_BC11.fa',
 '1212_1D_barcoding_Wagga_BC12.fa',
 '1212_1D_barcoding_Wagga_BC13.fa',
 '1212_1D_barcoding_Wagga_BC14.fa',
 '1212_1D_barcoding_Wagga_BC15.fa',
 '1212_1D_barcoding_Wagga_BC16.fa',
 '1212_1D_barcoding_Wagga_BC17.fa',
 '1212_1D_barcoding_Wagga_BC19.fa',
 '1212_1D_barcoding_Wagga_BC20.fa',
 '1212_1D_barcoding_Wagga_BC21.fa',
 '1212_1D_barcoding_Wagga_BC22.fa',
 '1212_1D_barcoding_Wagga_BC23.fa',
 '1212_1D_barcoding_Wagga_BC24.fa',
 '1212_1D_barcoding_Wagga_BC25.fa',
 '1212_1D_barcoding_Wagga_BC26.fa',
 '1212_1D_barcoding_Wagga_BC27.fa',
 '1212_1D_barcoding_Wagga_BC28.fa',
 '1212_1D_barcoding_Wagga_BC

In [92]:
#Barcode of every fasta file: the four characters directly before the '.fa' suffix
#TODO: convert to a search parameter?
basecalled_fasta_barcodes = [fasta_name[-7:-3] for fasta_name in basecalled_fasta_file_names]
#print(basecalled_fasta_file_names[1][-7:-3])  
#print(basecalled_fasta_barcodes)

In [197]:
#Barcodes treated as good; every other barcode found among the fasta files is bad
good_barcodes = ['BC01', 'BC02', 'BC03', 'BC04', 'BC05', 'BC06']
all_barcodes = basecalled_fasta_barcodes
bad_barcodes = sorted(set(all_barcodes).difference(good_barcodes))
#bad_barcodes_BC00 = [x for x in bad_barcodes if 'BC' in x]
#bad_barcodes_NB00 = [x for x in bad_barcodes if 'NB' in x]


In [122]:
#Extracts the ID from the basecalled file names: everything before the last
#8 characters, i.e. before '_BC??.fa' / '_NB??.fa'
basecalled_fasta_file_ids = sorted(set(x[:-8] for x in basecalled_fasta_file_names))
if not basecalled_fasta_file_ids:
    raise ValueError('No basecalled fasta files found; cannot derive the file ID')
#Sorting makes the choice reproducible when the files carry more than one ID
#(picking element 0 of an unordered set could change between kernel restarts)
basecalled_fasta_file_id = basecalled_fasta_file_ids[0]

In [210]:
#Create text file to summarise read_id, barcode and length for all basecalled reads, save in analysis/summaries/
analysis_summaries_path = work_dir + 'analysis/summaries/'
complete_path_and_name = os.path.join(analysis_summaries_path, basecalled_fasta_file_id + ".lengths.txt")

#Context managers close the summary file and every fasta file even if parsing fails part-way
with open(complete_path_and_name, "w") as lengths_txt:
    print("Read_id,Barcode,Length,Quality", file=lengths_txt)

    #The fasta names are bare file names, so they are opened relative to data/basecalled/
    os.chdir(work_dir + 'data/basecalled/')
    for fa_file in basecalled_fasta_file_names:
        #The barcode depends only on the file name, so classify once per file, not once per read
        file_barcode = fa_file[-7:-3]
        if file_barcode in good_barcodes:
            reported_barcode, quality = file_barcode, 'Good'
        elif file_barcode in bad_barcodes:
            #All bad barcodes are pooled under the single label NB00
            reported_barcode, quality = 'NB00', 'Bad'
        else:
            continue
        with open(fa_file) as fa_handle:
            for seq in SeqIO.parse(fa_handle, 'fasta'):
                print(",".join([seq.id, reported_barcode, str(len(seq)), quality]), file=lengths_txt)

In [211]:
#Load the per-read summary file (header: Read_id,Barcode,Length,Quality) into a dataframe
basecalled_df = pd.read_csv(complete_path_and_name)


In [214]:
#Show the last rows of the dataframe as a quick check that the file loaded as expected
basecalled_df.tail()

Unnamed: 0,Read_id,Barcode,Length,Quality
533028,312cc732-cb9d-4af2-972b-fd853dad0b3d_Basecall_...,NB00,750,Bad
533029,3d6dcaa5-6dff-4b84-bf68-8d277375b0fe_Basecall_...,NB00,1804,Bad
533030,c5df690f-7603-41fe-8f29-84d3b49c2066_Basecall_...,NB00,1388,Bad
533031,05261b66-e642-4058-a63f-9a6e60e4d752_Basecall_...,NB00,6469,Bad
533032,cb1a108f-a791-4842-a77c-c30ca3555b50_Basecall_...,NB00,1515,Bad


In [None]:
### DATAFRAME ANALYSIS - TABLE OF SUMMARY DATA ###

In [246]:
### Shows number of reads, sum, max, mean, median of read lengths per barcode, all bad barcodes sorted to NB00 ###

#Column keys used by the formatting and renaming below. They are set explicitly
#instead of being taken from pivot_table(aggfunc=[len, np.sum, np.max, ...]),
#whose column labels come from each function's __name__ (np.max is 'amax' or
#'max' depending on the NumPy version) and may be nested under a second
#'Length' level depending on the pandas version.
summary_columns = ['len', 'sum', 'amax', 'mean', 'median']

#One row per barcode: read count, total / max / mean / median read length
per_barcode_stats = basecalled_df.groupby('Barcode')['Length'].agg(['size', 'sum', 'max', 'mean', 'median'])
per_barcode_stats.columns = summary_columns

#'All' row: the same statistics over every read (what margins=True provided)
all_lengths = basecalled_df['Length']
overall_stats = pd.DataFrame({'len': [len(all_lengths)],
                              'sum': [all_lengths.sum()],
                              'amax': [all_lengths.max()],
                              'mean': [all_lengths.mean()],
                              'median': [all_lengths.median()]},
                             index=['All'],
                             columns=summary_columns)

basecalled_df_pivot = pd.concat([per_barcode_stats, overall_stats])
basecalled_df_pivot.index.name = None

#Counts, maxima and medians are shown as whole numbers (fractional medians are truncated)
for clmns in ['len', 'amax', 'median']:
    basecalled_df_pivot[clmns] = basecalled_df_pivot[clmns].astype(int)
#Total length is reported in Mbp with 3 decimals, mean length with 2 decimals
basecalled_df_pivot['sum'] = basecalled_df_pivot['sum'].apply(lambda x: "{0:.3f}".format(int(x)/1000000))
basecalled_df_pivot['mean'] = basecalled_df_pivot['mean'].apply(lambda x: "{0:.2f}".format(x))

#Presentation copy with human-readable column titles; rename() returns a new frame
basecalled_df_pivot_formal = basecalled_df_pivot.rename(columns={'len': 'Number of Reads',
                                                                 'sum': 'Total Length (Mbp)',
                                                                 'amax': 'Max Length (bp)',
                                                                 'mean': 'Mean Length (bp)',
                                                                 'median': 'Median Length (bp)'})

basecalled_df_pivot_formal

In [None]:
### DATAFRAME ANALYSIS - HISTOGRAMS OF SUMMARY DATA ###

In [268]:
#Histogram of read length over all basecalled reads, annotated in the top-right
#corner with every statistic from the 'All' row of the summary table
longest_read = basecalled_df.Length.max()

plt.figure(figsize=(10,5))
basecalled_df.Length.hist(bins=30)

plt.title('Basecalled Reads - Total Read Length Distribution', y=1.03, fontsize='x-large', fontweight='bold')
plt.xlabel('Read Length', fontsize=16)
plt.ylabel('Read Count', fontsize=16)

#One tick every 2000 bp, with 1000 bp of padding on either side of the data
plt.xticks(np.arange(0, longest_read + 1, 2000))
plt.xlim([-1000, longest_read + 1000])

#Stack the annotations downwards, 30 points apart, starting 30 points below the top edge
for idx, clmn_name in enumerate(basecalled_df_pivot_formal.columns):
    summary_value = basecalled_df_pivot_formal[clmn_name]['All']
    plt.annotate('{} = {}'.format(clmn_name, summary_value),
                 xy=(1, 1),
                 xycoords='axes fraction',
                 xytext=(-20, -30 - 30*idx),
                 textcoords='offset points',
                 fontsize=16,
                 fontweight='normal',
                 ha='right',
                 va='top')

plt.savefig(work_dir + 'analysis/graphs/graph1.png', bbox_inches='tight')

In [359]:
#Generates a histogram showing the total read length distribution for all basecalled reads per barcode

#Barcodes in order of first appearance in the dataframe; each gets one subplot
basecalled_barcodes = list(basecalled_df['Barcode'].unique())

#The grid is always 2 columns wide, so round the subplot count up to the next even number
no_of_subplots = len(basecalled_barcodes) + len(basecalled_barcodes) % 2

#Always 2 columns, bc-count/2 rows
no_of_subplots_pair = [no_of_subplots // 2, 2]

#Produce pairs of indices correlating to the coordinates of the subplots, row by row
subplot_coordinates = list(product(range(no_of_subplots_pair[0]), range(no_of_subplots_pair[1])))
subplot_coordinates_list = [list(l) for l in subplot_coordinates]
subplot_coordinates_list_rows = [i[0] for i in subplot_coordinates_list]
subplot_coordinates_list_columns = [i[1] for i in subplot_coordinates_list]

#squeeze=False keeps ax two-dimensional even when the grid has a single row
#(one or two barcodes); otherwise ax[row, column] indexing would fail
fig, ax = plt.subplots(no_of_subplots_pair[0], no_of_subplots_pair[1], figsize=(15,10), squeeze=False)

xmax = int(basecalled_df.Length.max())

#groupby splits the rows by barcode. For each barcode, take the height of the
#tallest bin of a 50-bin histogram of its read lengths; the largest of these
#(last element after sorting) sets a common y-axis limit for all subplots.
grouped = basecalled_df.groupby('Barcode')
max_count_list = sorted(np.histogram(barcode_lengths, 50)[0].max()
                        for _, barcode_lengths in grouped['Length'])

def applyGroupHistograms(ax_ind1, ax_ind2, bcs):
    """Draw the read length histogram of one barcode into one subplot.

    Args:
        ax_ind1: Row index of the target subplot in the notebook-level ``ax`` grid.
        ax_ind2: Column index of the target subplot in the ``ax`` grid.
        bcs: Barcode label; selects the rows of ``basecalled_df`` to plot and the
             row of ``basecalled_df_pivot_formal`` used for the annotations.

    Reads the notebook-level names ``ax``, ``xmax``, ``basecalled_df``,
    ``max_count_list`` and ``basecalled_df_pivot_formal``. Returns nothing; the
    subplot is modified in place.
    """
    axis = ax[ax_ind1, ax_ind2]

    #About 60 equal-width bins shared by every barcode. The width is at least 1
    #so the step can never be 0, and the last edge reaches xmax so the longest
    #reads are not left out of the histogram.
    bin_width = max(1, xmax // 60)
    bin_edges = np.arange(0, xmax + bin_width, bin_width)

    axis.hist(basecalled_df.groupby('Barcode')['Length'].get_group(bcs),
              bins=bin_edges,
              color='green',
              alpha=0.8)
    axis.set_title(bcs)
    axis.set_xlabel('Read Length')
    axis.set_ylabel('Read Count')
    #Same x range, tick spacing and y range on every subplot so barcodes can be compared
    axis.set_xlim([-1000, basecalled_df['Length'].max() + 1000])
    axis.set_xticks(np.arange(0, xmax + 1, 2000))
    axis.set_ylim(0, max_count_list[-1]*(5/4))
    #'which' must be lowercase: current matplotlib rejects 'Major'
    axis.grid(True, which='major')

    #Annotations stacked downwards from the top-right corner, 30 points apart
    for idx, clmn_name in enumerate(['Number of Reads', 'Total Length (Mbp)', 'Median Length (bp)']):
        axis.annotate(clmn_name + ' = ' + str(basecalled_df_pivot_formal[clmn_name][bcs]),
                      xy=(1, 1),
                      xycoords='axes fraction',
                      fontsize=16,
                      fontweight='normal',
                      xytext=(-20, -25 - 30*idx),
                      textcoords='offset points',
                      ha='right',
                      va='top')

#One subplot per barcode, filled row by row. zip stops at the shorter sequence,
#so with an odd number of barcodes the last grid position stays empty.
for ax_ind1, ax_ind2, bcs in zip(subplot_coordinates_list_rows, subplot_coordinates_list_columns, basecalled_barcodes):
    applyGroupHistograms(ax_ind1, ax_ind2, bcs)

#Remove the unused last subplot only when the barcode count is odd; with an
#even count every grid position holds a real histogram
if len(basecalled_barcodes) % 2 != 0:
    plt.delaxes(ax[subplot_coordinates_list_rows[-1], subplot_coordinates_list_columns[-1]])

plt.suptitle('Basecalled Reads - Read Length Distribution By Barcode',
             y=1.03,
             fontsize='x-large',
             fontweight='bold')
plt.tight_layout()

plt.savefig(work_dir + 'analysis/graphs/graph2.png', bbox_inches='tight')

In [None]:
#TODO: create a function that, given an input location, moves all blast output files into the matching blast/ subfolder (blast/rgbesthit/ or blast/ncbibesthit/)

In [374]:
#Walk blast/rgbesthit bottom-up, flatten the os.walk result into one list of
#strings and keep, sorted, the names that contain 'blast.besthit'
os.chdir(work_dir + 'blast/')
rgblast_file_names = list(os.walk('./rgbesthit', topdown=False))
rgblast_file_names_flattened = list(flatten(rgblast_file_names))
rgblast_besthit_file_names = sorted(walked_name for walked_name in rgblast_file_names_flattened
                                    if 'blast.besthit' in walked_name)

In [377]:
#Display the sorted list of best-hit file names as a check before loading them
rgblast_besthit_file_names

['1212_1D_barcoding_Wagga_BC00.fa.WW_19121016.blast.besthit',
 '1212_1D_barcoding_Wagga_BC01.fa.WW_19121016.blast.besthit',
 '1212_1D_barcoding_Wagga_BC02.fa.WW_19121016.blast.besthit',
 '1212_1D_barcoding_Wagga_BC03.fa.WW_19121016.blast.besthit',
 '1212_1D_barcoding_Wagga_BC04.fa.WW_19121016.blast.besthit',
 '1212_1D_barcoding_Wagga_BC05.fa.WW_19121016.blast.besthit',
 '1212_1D_barcoding_Wagga_BC06.fa.WW_19121016.blast.besthit',
 '1212_1D_barcoding_Wagga_NB00.fa.WW_19121016.blast.besthit']

In [384]:
#Load every best-hit blast table into its own dataframe, keyed by barcode
os.chdir(work_dir + 'blast/rgbesthit')

#Column names applied to the header-less blast tables. 'barcode' is filled in
#below from the file name.
#NOTE(review): 'species' is empty in the displayed BC02 table - presumably the
#files carry no such column and it is meant to be filled later; confirm.
rgblast_column_names = ['qseqid', 'sseqid', 'evalue', 'bitscore', 'length', 'pident',
                        'nident', 'species', 'barcode']

rgblast_dfs_dict = {}
for besthit_file in rgblast_besthit_file_names:
    if '.blast.besthit' not in besthit_file:
        continue
    #Barcode = last four characters of the name part before the first '.'
    key = besthit_file.split('.')[0][-4:]
    #sep=r'\s+' splits on any run of whitespace; it is the documented
    #equivalent of the deprecated delim_whitespace=True
    rgblast_dfs_dict[key] = pd.read_csv(besthit_file, sep=r'\s+', names=rgblast_column_names)
    rgblast_dfs_dict[key]['barcode'] = key

SyntaxError: invalid syntax (<ipython-input-384-f7dad4ac8e64>, line 9)

In [392]:
#Preview the first rows of one barcode's best-hit table instead of displaying the whole frame
rgblast_dfs_dict['BC02'].head(10)

Unnamed: 0,qseqid,sseqid,evalue,bitscore,length,pident,nident,species,barcode
0,00005d56-7077-49cf-9807-46852eb36a21_Basecall_...,TGACv1_scaffold_327975_4BS,0.000000e+00,1338.0,1270,87.17,1107,,BC02
1,00005d6b-fc54-4070-a72e-04e6e56226ea_Basecall_...,TGACv1_scaffold_030242_1BL,0.000000e+00,4728.0,4630,86.63,4011,,BC02
2,000099d0-ab1c-4010-97d9-cbb963467ac4_Basecall_...,TGACv1_scaffold_606415_7DL,2.000000e-126,460.0,571,83.19,475,,BC02
3,000146cd-4ded-471e-86f0-2390f5958b9d_Basecall_...,TGACv1_scaffold_031052_1BL,4.000000e-114,420.0,573,81.68,468,,BC02
4,0001aac0-6697-4785-b990-ae42706c8c26_Basecall_...,TGACv1_scaffold_602887_7DL,0.000000e+00,928.0,954,85.32,814,,BC02
5,0002a860-bbc7-4bdb-85e8-978aef493941_Basecall_...,TGACv1_scaffold_234012_3B,0.000000e+00,1530.0,2314,80.25,1857,,BC02
6,0003278c-ea34-496e-9a06-fd46e9d820f2_Basecall_...,TGACv1_scaffold_499976_6BL,0.000000e+00,1203.0,1027,89.09,915,,BC02
7,0003b6df-490f-4bc4-a0d4-fb6b21a3b84f_Basecall_...,TGACv1_scaffold_227006_3B,3.000000e-180,640.0,922,81.24,749,,BC02
8,00048308-a60c-4f4b-a6b7-cd93eb5c3ed1_Basecall_...,TGACv1_scaffold_527989_6DL,0.000000e+00,2401.0,2237,87.66,1961,,BC02
9,0006e104-a5ac-4b87-91b9-ed5e9b01af45_Basecall_...,TGACv1_scaffold_609188_7DL,5.000000e-168,599.0,735,82.99,610,,BC02
