# DK_0911_post_allele_analysis_v02

Based on original code by Benjamin Schwessinger.

- Inputs: output from `DK_0911_defining_alleles_v02` & primary+haplotig (ph) protein/gene/cds .*fasta* files from `DK_0911_generate_fasta_files_from_gff3`.
- Programs: **MUSCLE**, **PAML**
- Purpose: generate and save a DataFrame containing dN/dS information (number of nonsynonymous substitutions per non-synonymous site to the number of synonymous substitutions per synonymous site), as well as Hamming & Levenshtein distances (measures of % identity). Also provides visualisations of some of this data.

#### Overview
1. Reads in the large allele DataFrames generated in `DK_0911_defining_alleles_v02` (i.e. proteinortho hits OR best blast hit) - see description header cell in that notebook for more information on which alleles are included in that DataFrame.
2. Filters the allele DataFrames based on %ID and %QCov (this can be set to filter only BLAST-identified alleles or both BLAST- and proteinortho-identified alleles) so that distance information is not calculated on an unnecessarily large number of alleles.
3. Calculates distance & dN/dS information, and saves this to an output file so that it does not have to be re-calculated (if for whatever reason, the inputs change so that dN/dS or distance information should change, this output file (`DK_0911_v0x_analysed_alleles.df`) should be deleted so that it can be re-generated.
4. Plots graphs of allele-type distribution (pie chart) and allele-type Levenshtein distances (measures of similarity) for different levels of allele-filtering (QCov/TCov/%ID/Levenshtein similarity).

NB:
- dN/dS information is currently not utilised in this script.

In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import os
import shutil
from Bio import SeqIO
from Bio import AlignIO
import distance
import editdistance
import math
import subprocess
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import collections
import pybedtools
from sklearn.externals.joblib import Parallel, delayed
import itertools as it

In [3]:
#define variables here
GENOME_VERSION = 'v04'

BASE_PATH = '/home/benjamin/genome_assembly/Warrior/allele_analysis/%s/' % GENOME_VERSION

YN00_PATH = '/home/gamran/genome_analysis/Warrior/Richard/output/post_analysis/yn00.ctl'
BASE_OUT_PATH = os.path.join(BASE_PATH, 'post_analysis/')
ALLELE_PATH = os.path.join(BASE_PATH, 'allele_analysis/alleles_proteinortho_graph516/')
UNFILTERED_DF_PATH = os.path.join(BASE_PATH, \
    'allele_analysis/DK_0911_%s_p_ctg.DK_0911_%s_h_ctg.0.001.blastp.outfmt6.allele_analysis' \
                                  % (GENOME_VERSION, GENOME_VERSION))
GENOME_PATH = '/home/benjamin/genome_assembly/Warrior/genome_%s/' % GENOME_VERSION
FIGURE_PATH = os.path.join(BASE_OUT_PATH, 'figures')

GENOME = 'DK_0911_%s' % GENOME_VERSION
P_GENOME = GENOME + '_p_ctg'
H_GENOME = GENOME + '_h_ctg'

threads = 8

# Base filtering so that distance calculations are not performed on all allele pairs.
# Distance calculations will only be performed on allele pairs above the defined cutoffs.
# Note that proteinortho alleles will not be affected (this can be changed in the filterAlleleDf function).
BASE_QCOV_CUTOFF = 0
BASE_TCOV_CUTOFF = 0
BASE_PCTID_CUTOFF = 0

P_PROTEINS_FASTA = os.path.join(GENOME_PATH, P_GENOME + '.protein.fa')

PAML_PATH = os.path.join(BASE_OUT_PATH, 'paml')
if not os.path.exists(BASE_OUT_PATH):
    os.mkdir(BASE_OUT_PATH)
if not os.path.exists(FIGURE_PATH):
    os.mkdir(FIGURE_PATH)
if not os.path.exists(PAML_PATH):
    os.mkdir(PAML_PATH)
    shutil.copy2(YN00_PATH, PAML_PATH)

In [4]:
COV_PATH = '/home/benjamin/genome_assembly/Warrior/COV'
homo_bed_fh = os.path.join(COV_PATH, 'DK0911_v04_ph_ctg.ph_p_homo_cov.bed')
anno_gff_p_fh = os.path.join(GENOME_PATH, 'DK_0911_v04_p_ctg.anno.gff3')

In [5]:
PH_PROTEIN_FASTA = os.path.join(GENOME_PATH, GENOME + '_ph_ctg.protein.fa')
PH_GENE_FASTA = os.path.join(GENOME_PATH, GENOME + '_ph_ctg.gene.fa')
PH_CDS_FASTA = os.path.join(GENOME_PATH, GENOME + '_ph_ctg.cds.fa')

In [6]:
def allgffTogenegff(gff_fh, write_out=True):
    '''Converts at complete gff to a gene gff only and writtes it out.'''
    gene_gff = pd.read_csv(gff_fh, sep='\t', header=None)
    gene_gff = gene_gff[gene_gff[2] == 'gene']
    gene_gff.reset_index(drop=True, inplace=True)
    gene_gff.to_csv(gff_fh.replace('anno', 'gene'), sep='\t', header=False, index=False)
    return gene_gff

In [7]:
def col_8_id(x):
    '''Function that pulls out the ID from the 9th column of a df.'''
    pattern = r'ID=([a-zA-Z0-9_.]*);'
    regex = re.compile(pattern)  
    m = regex.search(x)
    match = m.groups()[0].replace('TU', 'model')
    if match.startswith('cds.'):
        match = match[4:]
    if 'exon' in match:
        _list = match.split('.')
        match = '.'.join(_list[:-1])
    return match

In [8]:
def assignMatchType(allele_source, overlap, no_overlap):
    if not allele_source == np.nan:
        return allele_source
    
    s = allele_source + '_'
    
    if overlap:
        s += 'overlap'
    elif no_overlap:
        s += 'no_overlap'
    else: # different_pcontig
        s += 'unlinked'
    return s

def reduceGroups(g):
    '''returns the best hit based on e-value and BitScore per group'''
    if len(g) == 1:
        return g
    tmp_g = g[g['e-value'] == g['e-value'].min()]
    if len(tmp_g) == 1:
        return tmp_g
    return tmp_g[tmp_g['BitScore'] == tmp_g['BitScore'].max()]

def filterAlleleDf(alleleDf, qCov, tCov, pctId, levSim, leavePO=False):
    if leavePO:
        no_PO_df = alleleDf[(alleleDf['allele_source'] == 'h_rBLAST') | (alleleDf['allele_source'] == 'BLAST')]
        PO_df = alleleDf[alleleDf['allele_source'] == 'PO']

        filtered_no_PO_df = filterAlleleDf(no_PO_df, qCov, tCov, pctId, levSim)
        return filtered_no_PO_df.append(PO_df, ignore_index=True)
    
    if qCov:
        alleleDf = alleleDf[alleleDf['QCov'] > qCov]
    if tCov:
        alleleDf = alleleDf[alleleDf['TCov'] > tCov]
    if pctId:
        alleleDf = alleleDf[alleleDf['PctID'] > pctId]
    if levSim:
        levDist = (100-levSim)/100.0
        alleleDf = alleleDf[alleleDf['protein_levenshtein'] < levDist]

    return alleleDf

In [9]:
def geneUnphased(Gene_gff_fh, Homo_cov_bed_fh ):
    """
    Returns a list of all genes that are unphased.
    
    Input: * Fh for annotation gff file
           * Fh for Homo_cov_bed_fh
    Output: A set of gene IDs that are unphases
    """
    geneGff_bed = pybedtools.BedTool(Gene_gff_fh)
    homo_p_bed = pybedtools.BedTool(Homo_cov_bed_fh)
    gene_ids_ph_p_homo = []
    for x in geneGff_bed.intersect(homo_p_bed, f=0.4):
        y = col_8_id(x[8])
        gene_ids_ph_p_homo.append(y)
    gene_ids_ph_p_homo = set(gene_ids_ph_p_homo)
    return gene_ids_ph_p_homo
    

In [10]:
def assign_unphased(alleleDf, gff_fh=anno_gff_p_fh, homo_bed_fh=homo_bed_fh):
    alleleDf = alleleDf.copy()
    _ = allgffTogenegff(gff_fh)
    Unphasedgenes = geneUnphased(gff_fh.replace('anno', 'gene'), homo_bed_fh)
    alleleDf['unphased'] = False
    alleleDf.loc[alleleDf.Query.isin(Unphasedgenes), 'unphased'] = True
    #print(len(Unphasedgenes))
    return alleleDf

In [11]:
def getFastaDict(fastaFile):
    d = {}
    for gene in SeqIO.parse(fastaFile, 'fasta'):
        d[gene.id] = gene
    return d

In [12]:
def writeAllelicFasta(alleleOne, alleleTwo, alleleType, outPath):
    '''writes fasta file containing fasta information for two alleles
    in the outPath'''
    assert(alleleType.upper() in ['CDS', 'GENE', 'PROTEIN'])
    
    seqRecordDict = globals()['SEQRECORD_' + alleleType.upper() + '_DICT']
    try:
        alleleSeqRecords = [seqRecordDict[alleleOne], seqRecordDict[alleleTwo]]
    except KeyError:
        print(alleleOne)
        print(alleleTwo)
        print(alleleType)
        sys.exit()
    with open(os.path.join(outPath, alleleType.lower() + '.fa'), 'w') as outFile:
        SeqIO.write(alleleSeqRecords, outFile, 'fasta')
    return True

def writeAlignmentScript(alleleOutPath, scriptLoc = os.path.join(PAML_PATH, 'paml_script.sh')):
    with open(scriptLoc, 'a') as outFile:
        print('cd %s' % alleleOutPath, file=outFile)
        print('/home/gamran/anaconda3/muscle3.8.31_i86linux64 -clwstrict -in protein.fa -out protein.aln', file=outFile)
        print('perl /home/gamran/anaconda3/pal2nal.v14/pal2nal.pl -output paml protein.aln cds.fa > cds_codon.aln', file=outFile)
        print('perl /home/gamran/anaconda3/pal2nal.v14/pal2nal.pl protein.aln cds.fa > cds_codon.clustal', file=outFile)
        print('cp %s/yn00.ctl ./' % PAML_PATH, file=outFile)
        print('/home/gamran/anaconda3/paml4.9g/bin/yn00', file=outFile)
    return True

In [13]:
def prepareAlignmentBashScript(scriptLoc = os.path.join(PAML_PATH, 'paml_script.sh')):
    with open(scriptLoc, 'w') as pamlScript:
        print('#!/bin/bash', file=pamlScript)

    for index, [Query, Target] in alleleDf.iloc[:, :2].iterrows():
        #if we don't have a blast hit skip.
        if pd.isnull(Target):
            continue
        else:
            alleleOutPath = os.path.join(PAML_PATH, '%s_%s' % (Query, Target))
            if not os.path.exists(alleleOutPath):
                os.mkdir(os.path.join(PAML_PATH, '%s_%s' % (Query, Target)))

            writeAllelicFasta(Query, Target, 'CDS', alleleOutPath)
            writeAllelicFasta(Query, Target, 'PROTEIN', alleleOutPath)

            writeAlignmentScript(alleleOutPath, os.path.join(PAML_PATH, 'paml_script.sh'))

In [216]:
def assignDistancesToAlleles(folder, alignmentFile, alleleType):
    '''Adds Hamming and Levenshtein distance columns to an allele pair
    (indexed by 'folder' name) in df'''
    #print(folder)
    if pd.isnull(folder):
        return np.nan, np.nan
    assert(alleleType.upper() in ['PROTEIN', 'CDS', 'GENE'])
    seq1, seq2 = AlignIO.read(open(alignmentFile, 'r'), format='clustal', seq_count=2)
    seq1 = str(seq1.seq).upper()
    seq2 = str(seq2.seq).upper()
    assert(len(seq1) == len(seq2))
    return editdistance.eval(seq1, seq2)/len(seq1), distance.hamming(seq1, seq2, normalized=True)

def assignDistancesToAllAlleles(df_folder_index, all_folders, tmp_path, suffix):
    """
    Reads in the index that contains the folder pairings for the alignements.
    Returns a protein_df and CDS_df that contain the hamming and levenshtein distance each.
    """
    cleaned_index = [x for x in df_folder_index if x in all_folders]
    #df = df.loc[cleaned_index, :]
    #df = df.loc[df.index.dropna(), :]
    #print(df.index)
    count = 0
    total = len(df_folder_index)
    percentDone = 0
    protein_lev_dict = {}
    protein_ham_dict = {}
    CDS_lev_dict = {}
    CDS_ham_dict = {}
    
    print("Calculating distances and adding them to the allele DataFrame...")
    
    for folder in cleaned_index:
        if pd.isnull(folder):
            proteinAlignmentFile = ''
            cdsAlignmentFile = ''
        else:
            proteinAlignmentFile = os.path.join(PAML_PATH, folder, 'protein.aln')
            cdsAlignmentFile = os.path.join(PAML_PATH, folder, 'cds_codon.clustal')
        #here the nan get overwritten. This doesn't matter though as they are all
        #nan anyway.
        protein_lev_dict[folder], protein_ham_dict[folder]  = \
        assignDistancesToAlleles(folder, proteinAlignmentFile, 'PROTEIN')
        CDS_lev_dict[folder], CDS_ham_dict[folder]  = \
        assignDistancesToAlleles(folder, cdsAlignmentFile, 'CDS')

        count += 1
        #if round(count/total * 100) > percentDone:
            #percentDone = round(count/total * 100)
            #print("%s%% complete" % percentDone)
            
    newdf_columns=['protein_hamming', 'protein_levenshtein', 'cds_hamming',
       'cds_levenshtein']
    if len(protein_ham_dict) > 0:
        df = pd.DataFrame([protein_ham_dict,protein_lev_dict,CDS_ham_dict,CDS_lev_dict]).T
        df.rename(columns=dict(zip(df.columns,newdf_columns)),inplace=True)
        out_name = os.path.join(tmp_path, '%s_%s.%s' % (df.index[0],df.index[-1],suffix))
        df.to_csv(out_name, sep='\t')

In [29]:
def parse_dNdS_to_df(line, alleleDf, folder, dNdS_label):
    dN = re.findall(r'dN = [-| ]?(.*) w', line)[0]
    dS = re.findall(r'dS = [-| ]?(.*) dN', line)[0]
    return assign_dNdS(dN, dS, alleleDf, folder, dNdS_label)

def assign_dNdS(dN, dS, alleleDf, folder, dNdS_label):
    if float(dS) > 0:
        alleleDf.loc[folder, dNdS_label] = float(dN)/float(dS)
    else:
        alleleDf.loc[folder, dNdS_label] = np.nan
    return alleleDf

def assign_dNdS_to_all_alleles(alleleDf):
    for folder in alleleDf.index:
        if pd.isnull(folder):
            continue
        alleleYn = os.path.join(PAML_PATH, folder,'yn.out')
        with open(alleleYn, 'r') as ynOut:
            #now loop over the lines and parse out stuff
            for i, line in enumerate(ynOut):
                if line.startswith('seq. seq. ') and i > 0:
                    next(ynOut) # we want the line that is two after the line starting with 'seq. seq '
                    dataLine = next(ynOut)
                    dN = dataLine.split('+-')[0].rstrip().split(' ')[-1]
                    dS = dataLine.split('+-')[1].rstrip().split(' ')[-1]
                    alleleDf = assign_dNdS(dN, dS, alleleDf, folder, 'yn00_dN/dS')
                elif line.startswith('LWL85:') and 'nan' not in line:
                    alleleDf = parse_dNdS_to_df(line, alleleDf, folder, 'LWL85_dN/dS')
                elif line.startswith('LWL85m:') and 'nan' not in line:
                    alleleDf = parse_dNdS_to_df(line, alleleDf, folder, 'LWL85m_dN/dS')
                elif line.startswith('LPB93:') and 'nan' not in line:
                    alleleDf = parse_dNdS_to_df(line, alleleDf, folder, 'LPB93_dN/dS')
                else:
                    continue
    return alleleDf

In [16]:
def checkPamlFilesExist(alleleDf):
    '''loops through all folder names in alleleDf.index to check if their PAML files have
    all been generated in those folders. refDict is based on the contents of a folder
    that was known to be run successfully.'''
    refDict = {'aln': 2,
     'clustal': 1,
     'ctl': 1,
     'dN': 1,
     'dS': 1,
     'fa': 2,
     'out': 1,
     'rst': 1,
     'rst1': 1,
     'rub': 1,
     't': 1}
    for file in (x for x in alleleDf.index if not pd.isnull(x)):
        if not os.path.exists(os.path.join(PAML_PATH, file)):
            return False
        discrepancies = getDiscrepancies(os.path.join(PAML_PATH, file), refDict)
        if discrepancies != '':
            print(discrepancies)
            return False
    return True

In [17]:
def grouper(iterable, n, fillvalue=None):
    "Collect data into fixed-length chunks or blocks"
    # grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx"
    args = [iter(iterable)] * n
    return it.zip_longest(*args, fillvalue=fillvalue)

In [213]:
def combineTmpToDf(header, suffix, tmp_path, clean=True):
    """Combines the files of a temporary folder into a dataframe based on the tmp files
    suffix. Returns combined dataframe. And cleans up if needed."""
    tmp_assigneddfs_fh = [os.path.join(tmp_path, file) for file in os.listdir(tmp_path)\
                         if file.endswith(suffix) ]
    tmp_df = pd.DataFrame(columns=header)
    for df_fh in tmp_assigneddfs_fh:
        tmp_df = pd.concat([distdf, pd.read_csv(df_fh, index_col = 0, sep='\t')])
    if clean == True:
        #now clean up again
        for file in tmp_assigneddfs_fh:
            os.remove(file)
    return tmp_df


In [18]:
hFullAlleleDf = pd.read_csv(os.path.join(ALLELE_PATH, '%s.full_df.alleles' % H_GENOME), header=0, sep='\t')
hFullAlleleDf['matchType'] = pd.Series(index=hFullAlleleDf.index)
hFullAlleleDf['matchType'] = hFullAlleleDf.apply(lambda row: assignMatchType(row['allele_source'], row['t_contig == h_contig_overlap'], row['q_contig == t_contig']), axis=1)
pFullAlleleDf = pd.read_csv(os.path.join(ALLELE_PATH, '%s.full_df.alleles' % P_GENOME), header=0, sep='\t')
pFullAlleleDf['matchType'] = pd.Series(index=pFullAlleleDf.index)
pFullAlleleDf['matchType'] = pFullAlleleDf.apply(lambda row: assignMatchType(row['allele_source'], row['t_contig == h_contig_overlap'], row['q_contig == t_contig']), axis=1)

In [19]:
# filter out haplotig proteins that already have alleles identified by BLAST or proteinortho.
hFullAlleleDf = hFullAlleleDf[(~hFullAlleleDf['Query'].isin(pFullAlleleDf['Target']))]
pFullAlleleDf['aQuery'] = pFullAlleleDf['Query']
pFullAlleleDf['aTarget'] = pFullAlleleDf['Target']
hFullAlleleDf['aQuery'] = hFullAlleleDf['Target']
hFullAlleleDf['aTarget'] = hFullAlleleDf['Query']

In [20]:
phFullAlleleDf = pFullAlleleDf.append(hFullAlleleDf)

In [21]:
SEQRECORD_PROTEIN_DICT = getFastaDict(PH_PROTEIN_FASTA)
SEQRECORD_GENE_DICT = getFastaDict(PH_GENE_FASTA)
SEQRECORD_CDS_DICT = getFastaDict(PH_CDS_FASTA)

In [198]:
alleleDf = phFullAlleleDf
alleleDf['folder'] = alleleDf.Query + '_' + alleleDf.Target
alleleDf.set_index('folder', inplace=True)
# assert(len(alleleDf) == len(overlapDf) + len(noOverlapDf) + len(diffContigDf) + len(manualAssignDf))

In [23]:
os.chdir('/home/gamran/genome_analysis/Warrior/Richard/scripts')
%run file_counting.ipynb


In [217]:
def main(alleleDf = alleleDf):
    prepareAlignmentBashScript(os.path.join(PAML_PATH, 'paml_script.sh'))
    
    # if already run before, comment out this line
    print("Checking whether all PAML files already exist in %s..." % PAML_PATH)
    if checkPamlFilesExist(alleleDf):
        print('PAML appears to have been run to completion previously. Therefore, it will not be run this time.')
    else:
        'Not all files generated by PAML appear to exist. Running PAML now (this may take some time)...'
        !bash {os.path.join(PAML_PATH, 'paml_script.sh')}
        print('PAML finished running.')

    analysedAllelesPath = os.path.join(BASE_OUT_PATH, GENOME+'_analysed_alleles.df')    
    alleleDf.to_csv(analysedAllelesPath, sep='\t')
    #dataframe where the index is not 'NaN'
    noNANdf = alleleDf.loc[~alleleDf.index.isnull(),:].copy()
    all_folders = noNANdf.index
    #generate a tmp folder for the parallized analysis
    tmp_path = os.path.join(BASE_OUT_PATH, 'tmp')
    if not os.path.exists(tmp_path):
        os.mkdir(tmp_path)
    #assign the distances
    dist_suffix = 'distdftmp'
    #do parallized analysi
    Parallel(n_jobs=threads)(delayed(assignDistancesToAllAlleles)(list(folder_index_list),all_folders,tmp_path, dist_suffix)\
                       for folder_index_list in grouper(noNANdf.index, 100, np.nan))
    distdf_header = ['protein_hamming', 'protein_levenshtein', 'cds_hamming',
       'cds_levenshtein']
    distdf = combineTmpToDf(distdf_header, dist_suffix, tmp_path)
    distdf['Index'] = distdf.index
    noNANdf['Index'] = noNANdf.index
    tmp_df = pd.merge(noNANdf, distdf,how='inner')
    tmp_df['folder'] = tmp_df.Query + '_' + tmp_df.Target
    tmp_df.set_index('folder', inplace=True)
    alleleDf.to_csv(analysedAllelesPath, sep='\t')
    #pd.util.testing.assert_frame_equal(alleleDf, pd.read_csv(analysedAllelesPath, sep='\t', index_col=0))
    
    
    #now assign the dNdS ratios
    
    alleleDf = assign_dNdS_to_all_alleles(alleleDf)
    
    aleleDf = assign_unphased(alleleDf)
    
    
    
    tmp_df.drop('Index', axis=1, inplace = True)
    alleleDf = pd.concat([tmp_df, alleleDf.loc[alleleDf.index.isnull(),:]])
    allele_Df.reset_index(drop=True, inplace=True)
    alleleDf.to_csv(analysedAllelesPath, sep='\t')
    
    return alleleDf, distdf

In [156]:
if __name__ == "__main__":
    alleleDf, distdf = main()

Checking whether all PAML files already exist in /home/benjamin/genome_assembly/Warrior/allele_analysis/v04/post_analysis/paml...
PAML appears to have been run to completion previously. Therefore, it will not be run this time.
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFr

Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and adding them to the allele DataFrame...
Calculating distances and

In [199]:
tmp_path = os.path.join(BASE_OUT_PATH, 'tmp')
tmp_assigneddfs_fh = [os.path.join(tmp_path, file) for file in os.listdir(tmp_path)\
                         if file.endswith('dftmp') ]

In [200]:
distdf_header = ['protein_hamming', 'protein_levenshtein', 'cds_hamming',
       'cds_levenshtein']
distdf = pd.DataFrame(columns=distdf_header)
for df_fh in tmp_assigneddfs_fh:
    distdf = pd.concat([distdf, pd.read_csv(df_fh, index_col = 0, sep='\t')])
distdf['Index'] = distdf.index
alleleDf['Index'] = alleleDf.index

In [208]:
tmp_df = pd.merge(alleleDf, distdf,how='inner')

In [206]:
tmp_df.drop('Index', axis=1, inplace = True)

Unnamed: 0,Query,Target,PctID,AlnLgth,NumMis,NumGap,StartQuery,StopQuery,StartTarget,StopTarget,...,t_contig == h_contig_overlap,allele_source,matchType,aQuery,aTarget,Index,protein_hamming,protein_levenshtein,cds_hamming,cds_levenshtein
0,evm.model.pcontig_000.100,evm.model.hcontig_072_002.3,25.00,240.0,155.0,8.0,110.0,330.0,125.0,358.0,...,False,BLAST,BLAST,evm.model.pcontig_000.100,evm.model.hcontig_072_002.3,evm.model.pcontig_000.100_evm.model.hcontig_07...,0.836344,0.818278,0.713780,0.621325
1,evm.model.pcontig_000.1001,evm.model.hcontig_029_007.1,70.27,37.0,11.0,0.0,130.0,166.0,52.0,88.0,...,False,BLAST,BLAST,evm.model.pcontig_000.1001,evm.model.hcontig_029_007.1,evm.model.pcontig_000.1001_evm.model.hcontig_0...,0.777778,0.777778,0.697856,0.654971
2,evm.model.pcontig_000.1002,evm.model.hcontig_003_006.10,96.88,128.0,4.0,0.0,1.0,128.0,83.0,210.0,...,False,BLAST,BLAST,evm.model.pcontig_000.1002,evm.model.hcontig_003_006.10,evm.model.pcontig_000.1002_evm.model.hcontig_0...,0.411483,0.411483,0.403509,0.403509
3,evm.model.pcontig_000.1003,evm.model.hcontig_003_006.9,96.95,164.0,5.0,0.0,1.0,164.0,345.0,508.0,...,False,BLAST,BLAST,evm.model.pcontig_000.1003,evm.model.hcontig_003_006.9,evm.model.pcontig_000.1003_evm.model.hcontig_0...,0.688363,0.688363,0.683761,0.683761
4,evm.model.pcontig_000.1005,evm.model.hcontig_000_078.13,39.13,92.0,56.0,0.0,25.0,116.0,1.0,92.0,...,True,BLAST,BLAST,evm.model.pcontig_000.1005,evm.model.hcontig_000_078.13,evm.model.pcontig_000.1005_evm.model.hcontig_0...,0.709677,0.701613,0.634409,0.602151
5,evm.model.pcontig_000.1006,evm.model.hcontig_003_006.7,92.91,127.0,9.0,0.0,1.0,127.0,1.0,127.0,...,False,BLAST,BLAST,evm.model.pcontig_000.1006,evm.model.hcontig_003_006.7,evm.model.pcontig_000.1006_evm.model.hcontig_0...,0.514403,0.514403,0.495199,0.495199
6,evm.model.pcontig_000.1007,evm.model.hcontig_000_078.16,100.00,391.0,0.0,0.0,1.0,391.0,1.0,391.0,...,True,BLAST,BLAST,evm.model.pcontig_000.1007,evm.model.hcontig_000_078.16,evm.model.pcontig_000.1007_evm.model.hcontig_0...,0.111111,0.111111,0.109599,0.108088
7,evm.model.pcontig_000.1009,evm.model.hcontig_020_005.32,63.64,77.0,28.0,0.0,51.0,127.0,317.0,393.0,...,False,BLAST,BLAST,evm.model.pcontig_000.1009,evm.model.hcontig_020_005.32,evm.model.pcontig_000.1009_evm.model.hcontig_0...,0.870674,0.867031,0.844566,0.829994
8,evm.model.pcontig_000.101,evm.model.hcontig_029_016.1,78.38,111.0,24.0,0.0,1.0,111.0,1.0,111.0,...,False,BLAST,BLAST,evm.model.pcontig_000.101,evm.model.hcontig_029_016.1,evm.model.pcontig_000.101_evm.model.hcontig_02...,0.218182,0.209091,0.136364,0.133333
9,evm.model.pcontig_000.1010,evm.model.hcontig_000_078.17,99.09,110.0,1.0,0.0,39.0,148.0,19.0,128.0,...,True,BLAST,BLAST,evm.model.pcontig_000.1010,evm.model.hcontig_000_078.17,evm.model.pcontig_000.1010_evm.model.hcontig_0...,0.244898,0.238095,0.229025,0.197279


In [191]:
len(alleleDf.loc[~alleleDf.index.isnull(),:]) == len(distdf.loc[:,:])

True

In [192]:
alleleDf

Unnamed: 0,AlnLgth,BitScore,Index,NumGap,NumMis,PctID,QCov,QLgth,Query,StartQuery,...,e-value,h_contig_overlap,matchType,p_protein,protein_hamming,protein_levenshtein,q_contig,q_contig == t_contig,t_contig,t_contig == h_contig_overlap
0.0,240.0,50.4,0.0,8.0,155.0,25.00,27.180068,883,evm.model.pcontig_000.100,110.0,...,1.000000e-06,['hcontig_000_058'],BLAST,evm.model.pcontig_000.100,0.8363443145589798,0.8182784272051009,pcontig_000,False,hcontig_072_002,False
1.0,37.0,53.5,1.0,0.0,11.0,70.27,21.511628,172,evm.model.pcontig_000.1001,130.0,...,2.000000e-10,['hcontig_000_078'],BLAST,evm.model.pcontig_000.1001,0.7777777777777778,0.7777777777777778,pcontig_000,False,hcontig_029_007,False
2.0,128.0,258.0,2.0,0.0,4.0,96.88,100.000000,128,evm.model.pcontig_000.1002,1.0,...,1.000000e-88,['hcontig_000_078'],BLAST,evm.model.pcontig_000.1002,0.41148325358851673,0.41148325358851673,pcontig_000,False,hcontig_003_006,False
3.0,164.0,315.0,3.0,0.0,5.0,96.95,100.000000,164,evm.model.pcontig_000.1003,1.0,...,2.000000e-106,['hcontig_000_078'],BLAST,evm.model.pcontig_000.1003,0.6883629191321499,0.6883629191321499,pcontig_000,False,hcontig_003_006,False
4.0,92.0,58.2,4.0,0.0,56.0,39.13,78.632479,117,evm.model.pcontig_000.1005,25.0,...,3.000000e-12,['hcontig_000_078'],BLAST,evm.model.pcontig_000.1005,0.7096774193548387,0.7016129032258065,pcontig_000,True,hcontig_000_078,True
5.0,127.0,238.0,5.0,0.0,9.0,92.91,52.049180,244,evm.model.pcontig_000.1006,1.0,...,5.000000e-80,['hcontig_000_078'],BLAST,evm.model.pcontig_000.1006,0.51440329218107,0.51440329218107,pcontig_000,False,hcontig_003_006,False
6.0,391.0,821.0,6.0,0.0,0.0,100.00,98.488665,397,evm.model.pcontig_000.1007,1.0,...,0.000000e+00,"['hcontig_000_078', 'hcontig_000_060']",BLAST,evm.model.pcontig_000.1007,0.1111111111111111,0.1111111111111111,pcontig_000,True,hcontig_000_078,True
7.0,77.0,112.0,7.0,0.0,28.0,63.64,47.530864,162,evm.model.pcontig_000.1009,51.0,...,1.000000e-29,"['hcontig_000_060', 'hcontig_000_078']",BLAST,evm.model.pcontig_000.1009,0.8706739526411658,0.8670309653916212,pcontig_000,False,hcontig_020_005,False
8.0,111.0,183.0,8.0,0.0,24.0,78.38,100.000000,111,evm.model.pcontig_000.101,1.0,...,1.000000e-60,['hcontig_000_058'],BLAST,evm.model.pcontig_000.101,0.21818181818181817,0.20909090909090908,pcontig_000,False,hcontig_029_016,False
9.0,110.0,228.0,9.0,0.0,1.0,99.09,74.324324,148,evm.model.pcontig_000.1010,39.0,...,2.000000e-77,"['hcontig_000_060', 'hcontig_000_078']",BLAST,evm.model.pcontig_000.1010,0.24489795918367346,0.23809523809523808,pcontig_000,True,hcontig_000_078,True


In [138]:
pd.concat([tmp_df, alleleDf.loc[alleleDf.index.isnull(),:]])

Unnamed: 0,AlnLgth,BitScore,Index,NumGap,NumMis,PctID,QCov,QLgth,Query,StartQuery,...,e-value,h_contig_overlap,matchType,p_protein,protein_hamming,protein_levenshtein,q_contig,q_contig == t_contig,t_contig,t_contig == h_contig_overlap
0.0,240.0,50.4,evm.model.pcontig_000.100_evm.model.hcontig_07...,8.0,155.0,25.00,27.180068,883,evm.model.pcontig_000.100,110.0,...,1.000000e-06,['hcontig_000_058'],BLAST,evm.model.pcontig_000.100,0.8363443145589798,0.8182784272051009,pcontig_000,False,hcontig_072_002,False
1.0,37.0,53.5,evm.model.pcontig_000.1001_evm.model.hcontig_0...,0.0,11.0,70.27,21.511628,172,evm.model.pcontig_000.1001,130.0,...,2.000000e-10,['hcontig_000_078'],BLAST,evm.model.pcontig_000.1001,0.7777777777777778,0.7777777777777778,pcontig_000,False,hcontig_029_007,False
2.0,128.0,258.0,evm.model.pcontig_000.1002_evm.model.hcontig_0...,0.0,4.0,96.88,100.000000,128,evm.model.pcontig_000.1002,1.0,...,1.000000e-88,['hcontig_000_078'],BLAST,evm.model.pcontig_000.1002,0.41148325358851673,0.41148325358851673,pcontig_000,False,hcontig_003_006,False
3.0,164.0,315.0,evm.model.pcontig_000.1003_evm.model.hcontig_0...,0.0,5.0,96.95,100.000000,164,evm.model.pcontig_000.1003,1.0,...,2.000000e-106,['hcontig_000_078'],BLAST,evm.model.pcontig_000.1003,0.6883629191321499,0.6883629191321499,pcontig_000,False,hcontig_003_006,False
4.0,92.0,58.2,evm.model.pcontig_000.1005_evm.model.hcontig_0...,0.0,56.0,39.13,78.632479,117,evm.model.pcontig_000.1005,25.0,...,3.000000e-12,['hcontig_000_078'],BLAST,evm.model.pcontig_000.1005,0.7096774193548387,0.7016129032258065,pcontig_000,True,hcontig_000_078,True
5.0,127.0,238.0,evm.model.pcontig_000.1006_evm.model.hcontig_0...,0.0,9.0,92.91,52.049180,244,evm.model.pcontig_000.1006,1.0,...,5.000000e-80,['hcontig_000_078'],BLAST,evm.model.pcontig_000.1006,0.51440329218107,0.51440329218107,pcontig_000,False,hcontig_003_006,False
6.0,391.0,821.0,evm.model.pcontig_000.1007_evm.model.hcontig_0...,0.0,0.0,100.00,98.488665,397,evm.model.pcontig_000.1007,1.0,...,0.000000e+00,"['hcontig_000_078', 'hcontig_000_060']",BLAST,evm.model.pcontig_000.1007,0.1111111111111111,0.1111111111111111,pcontig_000,True,hcontig_000_078,True
7.0,77.0,112.0,evm.model.pcontig_000.1009_evm.model.hcontig_0...,0.0,28.0,63.64,47.530864,162,evm.model.pcontig_000.1009,51.0,...,1.000000e-29,"['hcontig_000_060', 'hcontig_000_078']",BLAST,evm.model.pcontig_000.1009,0.8706739526411658,0.8670309653916212,pcontig_000,False,hcontig_020_005,False
8.0,111.0,183.0,evm.model.pcontig_000.101_evm.model.hcontig_02...,0.0,24.0,78.38,100.000000,111,evm.model.pcontig_000.101,1.0,...,1.000000e-60,['hcontig_000_058'],BLAST,evm.model.pcontig_000.101,0.21818181818181817,0.20909090909090908,pcontig_000,False,hcontig_029_016,False
9.0,110.0,228.0,evm.model.pcontig_000.1010_evm.model.hcontig_0...,0.0,1.0,99.09,74.324324,148,evm.model.pcontig_000.1010,39.0,...,2.000000e-77,"['hcontig_000_060', 'hcontig_000_078']",BLAST,evm.model.pcontig_000.1010,0.24489795918367346,0.23809523809523808,pcontig_000,True,hcontig_000_078,True


In [134]:
alleleDf.head()

Unnamed: 0_level_0,Query,Target,PctID,AlnLgth,NumMis,NumGap,StartQuery,StopQuery,StartTarget,StopTarget,...,t_contig,q_contig == t_contig,p_protein,h_contig_overlap,t_contig == h_contig_overlap,allele_source,matchType,aQuery,aTarget,Index
folder,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,evm.model.pcontig_000.10,,,,,,,,,,...,False,False,,False,False,BLAST,BLAST,evm.model.pcontig_000.10,,
evm.model.pcontig_000.100_evm.model.hcontig_072_002.3,evm.model.pcontig_000.100,evm.model.hcontig_072_002.3,25.0,240.0,155.0,8.0,110.0,330.0,125.0,358.0,...,hcontig_072_002,False,evm.model.pcontig_000.100,['hcontig_000_058'],False,BLAST,BLAST,evm.model.pcontig_000.100,evm.model.hcontig_072_002.3,evm.model.pcontig_000.100_evm.model.hcontig_07...
evm.model.pcontig_000.1001_evm.model.hcontig_029_007.1,evm.model.pcontig_000.1001,evm.model.hcontig_029_007.1,70.27,37.0,11.0,0.0,130.0,166.0,52.0,88.0,...,hcontig_029_007,False,evm.model.pcontig_000.1001,['hcontig_000_078'],False,BLAST,BLAST,evm.model.pcontig_000.1001,evm.model.hcontig_029_007.1,evm.model.pcontig_000.1001_evm.model.hcontig_0...
evm.model.pcontig_000.1002_evm.model.hcontig_003_006.10,evm.model.pcontig_000.1002,evm.model.hcontig_003_006.10,96.88,128.0,4.0,0.0,1.0,128.0,83.0,210.0,...,hcontig_003_006,False,evm.model.pcontig_000.1002,['hcontig_000_078'],False,BLAST,BLAST,evm.model.pcontig_000.1002,evm.model.hcontig_003_006.10,evm.model.pcontig_000.1002_evm.model.hcontig_0...
evm.model.pcontig_000.1003_evm.model.hcontig_003_006.9,evm.model.pcontig_000.1003,evm.model.hcontig_003_006.9,96.95,164.0,5.0,0.0,1.0,164.0,345.0,508.0,...,hcontig_003_006,False,evm.model.pcontig_000.1003,['hcontig_000_078'],False,BLAST,BLAST,evm.model.pcontig_000.1003,evm.model.hcontig_003_006.9,evm.model.pcontig_000.1003_evm.model.hcontig_0...


In [None]:
########## FIGURE PLOTTING ##########

def make_autopct(values):
    def my_autopct(pct):
        total = sum(values)
        val = int(round(pct*total/100.0))
        return '{p:.2f}%\n({v:d})'.format(p=pct,v=val)
    return my_autopct

def autolabel(rects, labels, ax, fontsize):
    """
    Attach a text label above each bar displaying its height
    """
    for i, rect in enumerate(rects):
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., height, str(labels[i]), ha='center', va='bottom', fontsize=fontsize)

def getNumNoAlleles(pProteinFastaFile, alleleDf):
    with open(pProteinFastaFile) as pProteinFasta:
        pProteinList = []
        for line in pProteinFasta:
            if line.startswith('>'):
                pProteinList.append(line[1:].strip())

    assert(len(pProteinList) == len(set(pProteinList)))


    pairedPProteinList = list(alleleDf['Query'])
    pairedPProteinList += list(alleleDf['Target'])
    pairedPProteinList = set(pairedPProteinList)
    
    for pairedPProtein in pairedPProteinList:
        if pairedPProtein in pProteinList:
            pProteinList.remove(pairedPProtein)
    
    return len(pProteinList)

def plotAlleleTypesPie(ax, alleleDf, colors, includeNoAlleles=True):
    '''Plots a pie chart of allele types, with the option of also including 
    primary proteins with no alleles. Strictly, this is not an accurate representation
    of the distribution of primary proteins as the reciprocal BLAST-identified (h on p) alleles
    may result in double-counting of primary proteins.
    '''
    # OrderedDict to preserve order, so that plots are coloured with same key as the distance 
    # bar graphs. This is a bit of a hack-fix; must enter these by hand again in the same order 
    # as 'matchType' occurs in the alleleAveragesByMatchType DataFrame.
    alleleTypeCountDict = collections.OrderedDict()
    
    for matchType in alleleDf['matchType'].unique():
        alleleTypeCountDict[matchType] = len(alleleDf[alleleDf['matchType'] == matchType])
    
    if includeNoAlleles==True:
        numNoAlleles = getNumNoAlleles(P_PROTEINS_FASTA, alleleDf)
        alleleTypeCountDict['no_allele'] = numNoAlleles

    patches, texts, autotexts = ax.pie(list(alleleTypeCountDict.values()), labels=alleleTypeCountDict.keys(), autopct=make_autopct(list(alleleTypeCountDict.values())), colors=colors)
    ax.axis('equal')
    ax.set_title('Allele Types', loc='center', fontsize=TITLE_SIZE, position=(0.5, 1.1))

def plotLevenshteinBar(alleleAverages, ax, colors):
    '''Plots a bar graph of normalised Levenshtein distances on ax from DataFrame alleleAverages.'''
    
    ind = np.arange(len(alleleAverages.protein_levenshtein))
    rects = ax.bar(ind, alleleAverages.protein_levenshtein, 0.35, color=colors, align='center') 
    
    sns.despine(top=True, right=True)
    
    barLabels = []
    for levDist in alleleAverages.protein_levenshtein:
        barLabels.append(str(int((1-levDist)*100)) + '%')
    autolabel(rects, barLabels, ax, INLINE_LABEL_SIZE)

    ax.set_xticks(ind)
    ax.set_xticklabels(alleleAverages.index, rotation=45)

    # ax.set_xlabel('Allele Types', fontsize=AXIS_LABEL_SIZE)
    ax.set_ylabel('Normalised Levenshtein Distance', fontsize=AXIS_LABEL_SIZE)

    ax.tick_params(axis='both', which='major', labelsize=AXIS_TICK_SIZE, pad=3)

    for tick in ax.get_xaxis().get_major_ticks():
        tick.set_pad(2*tick.get_pad())
        tick.label1 = tick._get_text1()
        
def plotAlleles(alleleDf, qCovFilters, tCovFilters, pctIdFilters, levSimFilters, leavePO):
    '''Makes a 3x2 plot with normalised Levenshtein distance plots in column 1 and
    a pie chart representing the distribution of allele types in column 2.
    Each row shows different levels of filtering.
    
    leavePO is a boolean that determines whether only BLAST hits will be filtered (leavePO=True)
    or both BLAST and PO alleles should be filtered (leavePO=False)'''
    cmap = plt.cm.Greens
    colors = cmap(np.linspace(0.0, 0.6, len(alleleDf['matchType'].unique())))
    
    assert(len(qCovFilters) == len(pctIdFilters) == len(levSimFilters))
    
    fig, ax = plt.subplots(len(qCovFilters), 2, figsize=(30, 12*len(qCovFilters)))
    
    for i in range(len(qCovFilters)):
        
        filteredAlleleDf = filterAlleleDf(alleleDf, qCovFilters[i], tCovFilters[i], pctIdFilters[i], levSimFilters[i], True)
        print(filteredAlleleDf[filteredAlleleDf.allele_source == 'PO'].shape[0])
        # levenshtein distance plot
        alleleAveragesByMatchType = filteredAlleleDf.groupby(['matchType']).mean()
        plotLevenshteinBar(alleleAveragesByMatchType, ax[i, 0], colors)
        ax[i, 0].set_xticklabels(alleleAveragesByMatchType.index, rotation=45, ha='right')
        
        # pie plot
        plotAlleleTypesPie(ax[i, 1], filteredAlleleDf, colors)
        
        # include filtering criteria in title
        qCovFilter = qCovFilters[i]
        tCovFilter = tCovFilters[i]
        pctIdFilter = pctIdFilters[i]
        levSimFilter = levSimFilters[i]
        if qCovFilter < BASE_QCOV_CUTOFF:
            qCovFilter = BASE_QCOV_CUTOFF
            print('Base QCov cut-off is 70%; if you desire to filter below this value, decrease BASE_QCOV_CUTOFF.')
        if tCovFilter < BASE_TCOV_CUTOFF:
            tCovFilter = BASE_TCOV_CUTOFF
            print('Base TCov cut-off is 70%; if you desire to filter below this value, decrease BASE_TCOV_CUTOFF.')
        if pctIdFilter < BASE_PCTID_CUTOFF:
            pctIdFilter = BASE_PCTID_CUTOFF
            print('Base %ID cut-off is 70%; if you desire to filter below this value, decrease BASE_PCTID_CUTOFF.')
        if not levSimFilter:
            levSimFilter = 0
    
        ax[i, 0].set_title('QCov > %s%%, TCov > %s%%, ID > %s%%, L. sim. > %s%%, PO Filtered: %s' % (qCovFilter, tCovFilter, pctIdFilter, levSimFilter, not leavePO), position=(0.5, 0.85))
        
    fig.tight_layout()
    fig.savefig(os.path.join(FIGURE_PATH, 'fig'), bbox_inches='tight')

In [None]:
# Used in the pie chart for all text except title
# the ax.pie plotting interface is weird - cannot set other font sizes properly?
mpl.rcParams['font.size'] = 24

TITLE_SIZE = 32
AXIS_LABEL_SIZE = 28
AXIS_TICK_SIZE = 24
INLINE_LABEL_SIZE = 24

# These lists must all be the same length
qCovFilters = [False, 80, 90, 95]
pctIdFilters = [False, 80, 90, 95]
tCovFilters = [False, 80, 90, 95]
levSimFilters = [False, False, False, False]

plotAlleles(alleleDf, qCovFilters, tCovFilters, pctIdFilters, levSimFilters, True)