# NGS mapping  
  
This jupyter notebook maps the raw FASTQ sequencing reads of all samples to the given reference fasta (H3N2-Bris07 full genome).

# Import libraries

In [1]:
from os.path import expanduser
from importlib.machinery import SourceFileLoader

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

import pandas as pd 
import numpy as np
import re
import os
import subprocess

# load custom flu and ngs libraries 
# NOTE: expanduser() only expands a leading "~"; this relative path is returned
# unchanged, so it is resolved against the notebook's working directory.
laeb_lib = expanduser("../python_lib") # folder where custom libraries are saved 
# Load the two project modules directly from their file paths inside laeb_lib.
# `ngs` is the pipeline module used by the cells below; `fc` is not referenced
# in the cells visible in this notebook.
fc = SourceFileLoader('fc', "%s/flu_common.py"%(laeb_lib)).load_module()
ngs = SourceFileLoader('ngs', "%s/laeb_ngs_pipeline.py"%(laeb_lib)).load_module()

# Inputs from user

In [2]:
# inputs 
# file path to data folder - fastq files to be analysed must be in {data folder}/raw
data_folder = './data' 
# path to the reference FASTA file (kept in ./input here)
ref_fasta_fname = './input/H3N2_Bris07.fasta' 
# CSV file containing the coding (CDS) regions of each gene segment (numbering should be based on that of the given reference sequence)
cds_coords = "./input/CDS_H3N2_Bris07.csv"
# primer coordinates (sequence numbering must be based on given reference sequence)
primer_coords = "./input/H3N2_primer_coords.csv"
# file path to metadata file (must contain a 'sampid' column; see the metadata cell)
meta_fname = './input/metadata.csv' 

# mapping options
trimmomatic_fpath = expanduser('~/opt/anaconda3/pkgs/trimmomatic-0.39-1/share/trimmomatic-0.39-1/') # file path to trimmomatic (machine-specific; adjust per installation)
threadnum = 4 # number of CPU threads for parallelization 
base_qual_threshold = 20 # minimum accepted base quality 
max_indel_prop = 0.05 # max tolerable proportion of indels wrt read length 
max_indel_abs = 10 # max tolerable absolute number of indels 

# variant calling options
Query_HAnum_subtype = 'absH3' # query HA numbering subtype; used as the index column of the HA numbering conversion table
HAnum_subtype = 'H3' # reporting HA numbering subtype
subtype_ant = 'H3ant'  # HA canonical antigenic site of interest (not referenced in the cells visible in this notebook)
min_cov = 100 # minimum coverage 
min_var_freq = 0 # NOTE(review): not referenced in the cells visible in this notebook - confirm whether it is still needed
min_var_prop = 0.02 # minimum variant proportion 
err_tol = 0.01 # max probability that a called variant resulted from base calling error alone
min_breadth = 0.7 # min breadth of gene segment to be mapped for further analysis (not referenced in the cells visible in this notebook)

# Parameters and functions 

This cell performs several initialisation procedures, including: 
 - defining parameters needed by the pipeline (e.g. gene segment lengths) and deriving the coding (CDS) regions of each protein.
 - defining dataframe for HA numbering conversion

In [3]:
# Reference FASTA used throughout the rest of the notebook
reffasta = ref_fasta_fname

# Build the per-gene lookups from the CDS table and the reference sequence.
# The first returned object is shown below: it maps each (gene, nucleotide
# position, protein) to an amino-acid number and codon frame.
(gene_to_proteinorf,
 influenza_gene_len,
 sorted_refnames,
 nucpos_shift) = ngs.initialisation(cds_coords, reffasta, laeb_lib)
display(gene_to_proteinorf.head())

# HA numbering conversion table shipped with the custom libraries; '-' marks
# a missing value. It is indexed by the query numbering scheme.
ha_numbering_conversion = (
    pd.read_csv(expanduser('%s/HA_numbering_conversion.csv'%(laeb_lib)), na_values='-')
      .set_index(Query_HAnum_subtype)
)
display(ha_numbering_conversion.head())

# nucleotide alphabet
all_bases = ['A', 'T', 'G', 'C']


Initialising CDS coordinates...

Check translated protein sequences...
PB2 MERIKELRNLMSQSRTREILTKTTVDHMAIIKKYTSGRQEKNPSLRMKWMMAMKYPITADKRITEMVPERNEQGQTLWSKMSDAGSDRVMVSPLAVTWWNRNGPVTSTVHYPKVYKTYFDKVERLKHGTFGPVHFRNQVKIRRRVDINPGHADLSAKEAQDVIMEVVFPNEVGARILTSESQLTITKEKKEELRDCKISPLMVAYMLERELVRKTRFLPVAGGTSSIYIEVLHLTQGTCWEQMYTPGGGVRNDDVDQSLIIAARNIVRRAAVSADPLASLLEMCHSTQIGGTRMVDILRQNPTEEQAVDICKAAMGLRISSSFSFGGFTFKRTSGSSVKKEEEVLTGNLQTLKIRVHEGYEEFTMVGKRATAILRKATRRLVQLIVSGRDEQSIAEAIIVAMVFSQEDCMIKAVRGDLNFVNRANQRLNPMHQLLRHFQKDAKVLFQNWGVEHIDSVMGMIGVLPDMTPSTEMSMRGIRVSKMGVDEYSSTERVVVSIDRFLRVRDQRGNVLLSPEEVSETQGTERLTITYSSSMMWEINGPESVLVNTYQWIIRNWEAVKIQWSQNPAMLYNKMEFEPFQSLVPKAIRSQYSGFVRTLFQQMRDVLGTFDTTQIIKLLPFAAAPPKQSRMQFSSLTVNVRGSGMRILVRGNSPVFNYNKTTKRLTILGKDAGTLIEDPDESTSGVESAVLRGFLIIGKEDRRYGPALSINELSNLAKGEKANVLIGQGDVVLVMKRKRDSSILTDSQTATKRIRMAIN*
PB1 MDVNPTLLFIKVPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRTHQYSEKGKWTTNTETGAPQLNPIDGPLPEDNEPSGYAQTDCVLEAMAFLEESHPGIFENSCLETMEAVQQTRVDKLTQGRQTYDWTLNRNQPAATALANTIEVFRSNGLTANE

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,aa,frame
gene,nuc,protein,Unnamed: 3_level_1,Unnamed: 4_level_1
1-PB2,1,PB2,1,1
1-PB2,2,PB2,1,2
1-PB2,3,PB2,1,3
1-PB2,4,PB2,2,1
1-PB2,5,PB2,2,2


Unnamed: 0_level_0,H3,absH1pdm,absH5,H1N1pdm,H5,H5c221,H1ant,H3ant,RBS
absH3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1.0,,1.0,1.0,,,,,,0
2.0,,2.0,2.0,,,,,,0
3.0,,3.0,3.0,,,,,,0
4.0,,4.0,4.0,,,,,,0
5.0,,5.0,5.0,,,,,,0


## Read metadata

Sample IDs are parsed from metadata file under the header "sampid".

In [7]:
# metadata
## metadata must have 'sampid' header which is used as sample identifier
## other columns read here: 'enrolD', 'enrol-onset', 'subject_id'
meta_df = pd.read_csv(meta_fname)

# unique sample IDs, in sorted order
sorted_sampid = sorted(set(meta_df['sampid']))
# timepoint = enrolment day + 'enrol-onset' offset
meta_df['timepoint'] = meta_df['enrolD'] + meta_df['enrol-onset']

meta_df = meta_df.sort_values(by=['subject_id', 'timepoint']).set_index('sampid')

# flag re-sequenced ("patch") samples, i.e. sample IDs ending in "_P".
# A boolean mask must go through .loc: .at is a scalar (single-cell) accessor
# and mask assignment through it is not supported.
meta_df['patch'] = 0
meta_df.loc[meta_df.index.str.contains(r'_P$', na=False), 'patch'] = 1

display(meta_df.head())

Unnamed: 0_level_0,idx,enrolD,ct,SampleType,subject_id,enrol-onset,date,age,timepoint,patch
sampid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
H3N2_1205_T0_T_S517_N701_R,R1,0,26,T,1205,6.0,1/8/07,2,6.0,0
H3N2_1205_T1_N_S517_N702_R,R2,1,25,N,1205,6.0,1/8/07,2,7.0,0
H3N2_1205_T2_T_S517_N703_R,R3,2,30,T,1205,6.0,1/8/07,2,8.0,0
H3N2_1205_T3_T_S517_N704_R,R4,3,29,T,1205,6.0,1/8/07,2,9.0,0
H3N2_1205_T4_T_S517_N705_R,R5,4,28,T,1205,6.0,1/8/07,2,10.0,0


In [8]:
# get path to raw FASTQ files sorted by read direction 
# Judging from the displayed output, the returned frame is indexed by
# (sampid, read) with read in {R1, R2} and has a single 'fpath' column
# pointing at files under {data_folder}/raw.
dat_df = ngs.generate_raw_fastq_df(sorted_sampid, data_folder)
display (dat_df.head())


Generate dataframe of input FASTQ files...


Unnamed: 0_level_0,Unnamed: 1_level_0,fpath
sampid,read,Unnamed: 2_level_1
H3N2_1205_T0_T_S517_N701_R,R1,./data/raw/H3N2_1205_T0_T_S517_N701_R_S1_L001_...
H3N2_1205_T0_T_S517_N701_R,R2,./data/raw/H3N2_1205_T0_T_S517_N701_R_S1_L001_...
H3N2_1205_T1_N_S517_N702_R,R1,./data/raw/H3N2_1205_T1_N_S517_N702_R_S2_L001_...
H3N2_1205_T1_N_S517_N702_R,R2,./data/raw/H3N2_1205_T1_N_S517_N702_R_S2_L001_...
H3N2_1205_T2_T_S517_N703_R,R1,./data/raw/H3N2_1205_T2_T_S517_N703_R_S3_L001_...


# Perform ```FASTQC```

```FastQC``` is used to check the quality of the raw FASTQ files (i.e. that $\ge$90% of reads are above the acceptable quality score), to determine the crop length for ```trimmomatic```, and to ensure that only a negligible amount of adapter sequence is present. 

In [9]:
# Run the pre-trim FastQC step on the raw FASTQ files, using base_qual_threshold
# as the minimum base quality.
# NOTE: dat_df is rebound to the returned frame; per the displayed output it
# gains the columns percent_abv_qualthres, start_pos and end_pos. Re-running
# this cell therefore feeds the already-annotated frame back in.
dat_df = ngs.pretrim_fastqc(dat_df, data_folder, base_qual_threshold)
display (dat_df.head())


Perform pre-trim FASTQC with minimum base quality 20 (change with --base_qual_threshold if needed)...

#-- Presence of adapter sequence (max. proportion of reads) --#
Nextera Transposase Sequence: 0.40%
Illumina Small RNA 3' Adapter: 0.00%
SOLID Small RNA Adapter: 0.00%
Illumina Universal Adapter: 0.00%
Illumina Small RNA 5' Adapter: 0.00%


Unnamed: 0_level_0,Unnamed: 1_level_0,fpath,percent_abv_qualthres,start_pos,end_pos
sampid,read,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
H3N2_1205_T0_T_S517_N701_R,R1,./data/raw/H3N2_1205_T0_T_S517_N701_R_S1_L001_...,0.99831,1.0,251.0
H3N2_1205_T0_T_S517_N701_R,R2,./data/raw/H3N2_1205_T0_T_S517_N701_R_S1_L001_...,0.997077,1.0,251.0
H3N2_1205_T1_N_S517_N702_R,R1,./data/raw/H3N2_1205_T1_N_S517_N702_R_S2_L001_...,0.996591,1.0,251.0
H3N2_1205_T1_N_S517_N702_R,R2,./data/raw/H3N2_1205_T1_N_S517_N702_R_S2_L001_...,0.995069,1.0,251.0
H3N2_1205_T2_T_S517_N703_R,R1,./data/raw/H3N2_1205_T2_T_S517_N703_R_S3_L001_...,0.995238,1.0,251.0


# Trim low quality ends

```Trimmomatic``` trims low quality ends and adapter sequences (if applicable). The crop length for each sample is the one determined by ```FastQC``` earlier.

In [11]:
# Trim the raw reads of every sample with Trimmomatic (installation located via
# trimmomatic_fpath), using threadnum threads.
# Per the displayed output, trimmed_df holds one row per sample with read-pair
# counts: total_pairs, both, both_prop, forward, reverse, dropped.
trimmed_df = ngs.trim_raw_fastq(dat_df, sorted_sampid, data_folder, trimmomatic_fpath, threadnum)
display (trimmed_df.head())


Trimming raw FASTQ sequences...


Unnamed: 0_level_0,total_pairs,both,both_prop,forward,reverse,dropped
sampid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
H3N2_1205_T0_T_S517_N701_R,188180,187863,0.998315,0,3,314
H3N2_1205_T1_N_S517_N702_R,212938,212211,0.996586,3,7,717
H3N2_1205_T2_T_S517_N703_R,205797,204817,0.995238,2,7,971
H3N2_1205_T3_T_S517_N704_R,216519,215527,0.995418,6,9,977
H3N2_1205_T4_T_S517_N705_R,199923,199171,0.996239,2,3,747


# Merge paired-end reads with ```FLASH```

In [13]:
# Merge the paired-end reads of every sample (FLASH, per the section heading).
# Per the displayed output, merge_df holds per-sample totals, the number of
# combined pairs and the combined fraction.
# NOTE(review): dat_df (not trimmed_df) is passed here - presumably merge_reads
# locates the trimmed files itself under data_folder; confirm.
merge_df = ngs.merge_reads(dat_df, data_folder, sorted_sampid)
display (merge_df.head())

Overall mean prop of pairs merged: 85.08% (SD: 12.47%)


Unnamed: 0_level_0,total,combined,percent_combined
sampid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
H3N2_1205_T0_T_S517_N701_R,187863,174351,0.928075
H3N2_1205_T1_N_S517_N702_R,212211,200015,0.942529
H3N2_1205_T2_T_S517_N703_R,204817,191516,0.935059
H3N2_1205_T3_T_S517_N704_R,215527,204893,0.95066
H3N2_1205_T4_T_S517_N705_R,199171,184334,0.925506


# Read mapping

We use ```bowtie2``` to align the trimmed, merged reads to the reference sequence. 

Flags used for ```bowtie2```: 
```
-x <refid> : Reference sequence to align by 
-X <int>   : Maximum fragment length for a valid paired-end alignment. E.g. with -X 100, two 20-bp mate alignments separated by a 60-bp gap are valid, but not if the gap is 61 bp 
-k <int>   : Searches for at most <int> of valid, distinct alignment for each read 
--local    : Does not require that the entire read align from one end to the other. Rather, some characters may be omitted ("soft clipped") from the ends in order to achieve the greatest possible alignment score. 
--very-sensitive : Same as -D 20 -R 3 -N 0 -L 20 -i S,1,0.50
```

In [15]:
# Index the reference FASTA and map every sample's reads with bowtie2 (see the
# log output below), using threadnum threads. The call is made for its side
# effects only: nothing is returned to the notebook. The next cell reads the
# resulting alignments from {data_folder}/align/{sampid}.sam.gz.
ngs.read_mapping_bt2(sorted_sampid, data_folder, influenza_gene_len, 
                     ref_fasta_fname, threadnum)


Index reference sequences...

Mapping reads with bowtie2...
...done.


# Combine SAM files for patch and initial samples

Some of the samples were re-sequenced (i.e. patch samples) due to low coverage of certain amplicons in the initial sequencing results. Here, we combine the SAM files of the re-sequenced samples with those of their initial counterparts.

In [20]:
# Merge re-sequenced ("patch") samples into their initial counterparts.
#
# For every (subject_id, enrolD) pair with more than one metadata entry, the
# gzipped SAM files of all entries are concatenated into a new file named
# "<initial sampid>P", the extra entries are removed from the metadata and the
# initial entry is renamed to the merged sample ID.
# Side effects: writes {data_folder}/align/<new_sampid>.sam.gz and
# ./input/metadata_patched.csv; rebinds meta_df and sorted_sampid.

# make 'sampid' (the current index) an ordinary column so it can be rewritten
try:
    meta_df = meta_df.reset_index()
except ValueError:
    # raised when an index level name already exists as a column,
    # i.e. the frame has already been reset
    pass

for subject_id in sorted(set(meta_df['subject_id'])):
    subject_meta_df = meta_df[meta_df['subject_id']==subject_id]

    for enrolD in sorted(set(subject_meta_df['enrolD'])):
        # initial sample (patch == 0) first, patch sample(s) after it
        enrolD_subject_meta_df = subject_meta_df[subject_meta_df['enrolD']==enrolD].sort_values(by='patch')
        if len(enrolD_subject_meta_df) < 2:
            continue

        sampid_list = list(enrolD_subject_meta_df['sampid'])
        new_sampid = sampid_list[0] + 'P'
        merged_fpath = '{}/align/{}.sam.gz'.format(data_folder, new_sampid)

        # concatenate sam files of initial and patch samples
        if not os.path.isfile(merged_fpath):
            source_fpaths = ['{}/align/{}.sam.gz'.format(data_folder, sampid) for sampid in sampid_list]
            try:
                # argument list + explicit stdout handle: no shell involved, so
                # paths with spaces or shell metacharacters are handled safely
                with open(merged_fpath, 'wb') as merged_fh:
                    subprocess.check_call(['cat'] + source_fpaths, stdout=merged_fh)
            except (subprocess.CalledProcessError, OSError):
                # never leave a partial file behind: the isfile() guard above
                # would treat it as a finished merge on the next run
                if os.path.isfile(merged_fpath):
                    os.remove(merged_fpath)
                raise

        idx_list = list(enrolD_subject_meta_df.index)
        # remove every merged-in (patch) entry from meta df, not only the last
        # one, since all of them were concatenated into the merged file
        meta_df = meta_df.drop(idx_list[1:])
        # change initial entry with new_sampid
        meta_df.at[idx_list[0], 'sampid'] = new_sampid

meta_df = meta_df.set_index('sampid').sort_index()
meta_df.to_csv("./input/metadata_patched.csv")
sorted_sampid = list(meta_df.index)

# Parse SAM files 

Quality filters:  
- excluded all unmapped and non-primary read alignments
- accept only bases with Q-score $\ge$ ```base_qual_threshold``` 

In [21]:
# Parse the SAM files of every sample. The call passes the base-quality and
# indel limits from the inputs cell (base_qual_threshold, max_indel_abs,
# max_indel_prop); the quality filters are described in the text above.
# NOTE(review): plt_show=0 presumably suppresses inline plotting - confirm
# against ngs.parse_sam.
mapping_stats = ngs.parse_sam(sorted_sampid, sorted_refnames, data_folder, 
                              base_qual_threshold, max_indel_abs, max_indel_prop, 
                              threadnum=threadnum, plt_show=0)
display (mapping_stats.head())

# Sort reads based on amplicons

For all gene segments other than the matrix and non-structural gene segments, we performed six independent PCR amplifications of three overlapping amplicons covering the entire gene. Here, we assign each mapped read to the amplicon it was derived from by identifying sites that are uniquely covered by each individual amplicon. 

In [22]:
# Assign the mapped reads of every sample to amplicons, using the primer
# coordinates file. Called for its side effects only: nothing is returned to
# the notebook.
ngs.sort_reads(sorted_sampid, primer_coords, gene_to_proteinorf, threadnum=threadnum)

# Tally base and codon counts

In [23]:
# Tally base and codon counts for every sample (see the section heading).
# Called for its side effects only: nothing is returned to the notebook.
# NOTE(review): presumably this produces the ./results/map_nuc_results_{sampid}.csv
# files read by the coverage cells below - confirm against ngs.tally_bases.
ngs.tally_bases(sorted_sampid, threadnum=threadnum)

# Variant calling

In addition to the 2% frequency threshold, we compute a statistical threshold for a variant to be called, following Illingworth (Bioinformatics, 2016). Suppose $q$ is the minimum required base quality score; the per-base error rate is then $p_e = 10^{-q/10}$. If $n$ out of $N$ bases called at a site support a variant, the probability that this resulted from base-calling errors alone is modelled as: 

$p_{Err} = \sum_{i=n}^{N}{\begin{pmatrix}
N \\
i 
\end{pmatrix}p_{e}^{i}(1-p_e)^{N-i}}$

Variant is only called if $p_{Err}<0.01$.

In [24]:
# Call variants for every sample using the thresholds from the inputs cell:
# min_cov (minimum coverage), min_var_prop (minimum variant proportion) and
# err_tol (error-probability cut-off described in the text above).
# HA positions are converted with ha_numbering_conversion and reported in the
# HAnum_subtype numbering.
# The cells below use the columns 'gene', 'nucpos', 'nuc_var', 'sampid' and
# 'nuc_prop' of the returned frame.
variant_call_df = ngs.variant_calling(sorted_sampid, sorted_refnames, base_qual_threshold,
                                      min_cov, min_var_prop, gene_to_proteinorf, err_tol, 
                                      ha_numbering_conversion=ha_numbering_conversion, 
                                      HAnum_subtype=HAnum_subtype, threadnum=threadnum)
display (variant_call_df.head())

# Determine basis for breadth of coverage 

Due to the inconsistent coverage across gene segments for certain samples, instead of using an arbitrarily defined minimum breadth of coverage, we find all polymorphic sites with >2% minority variants in at least 2 samples. A gene segment of a sample is deemed to be amply covered if >70% of these polymorphic sites meet the minimum coverage requirement (>100x). 

In [25]:
# Collect, per gene segment, the polymorphic sites that are shared between
# samples: sites where a variant with proportion >= min_var_prop was called in
# at least two different samples. The result, all_polymorphic_sites, is a frame
# indexed by 'gene' with a single 'nucpos' column.

# make sure 'gene' is an ordinary column before re-indexing on it
try:
    variant_call_df = variant_call_df.reset_index()
except ValueError:
    # raised when an index level name already exists as a column
    pass

variant_call_df = variant_call_df.set_index('gene').sort_index()

all_polymorphic_sites = []

for gene in sorted_refnames:

    # A boolean mask always yields a DataFrame, also for genes with a single
    # variant row (where .loc[gene] returns a Series) or none at all (where
    # .loc[gene] raises KeyError).
    gene_vcf = variant_call_df[variant_call_df.index == gene]
    gene_vcf = gene_vcf[gene_vcf['nuc_prop'] >= min_var_prop]

    # Number of distinct samples with a variant at each site. Counting
    # distinct samples keeps repeated rows of one sample (several variants or
    # several rows per variant at the same site) from inflating the tally.
    samples_per_site = gene_vcf.groupby('nucpos')['sampid'].nunique()

    # all nucpos where mutations were found in at least 2 samples
    shared_sites = samples_per_site[samples_per_site > 1]

    polymorphic_sites = pd.DataFrame({'nucpos': shared_sites.index.values})
    polymorphic_sites.insert(0, 'gene', gene)

    all_polymorphic_sites.append(polymorphic_sites[['gene', 'nucpos']])

all_polymorphic_sites = pd.concat(all_polymorphic_sites).set_index('gene').sort_index()
display(all_polymorphic_sites.head())

# Nucleotide coverage plots for each patient 

Here, we also determined which gene segment of each sample met the required coverage breadth. 

In [26]:
# Per-subject coverage plots.
#
# For every subject, one figure with one panel per gene segment is drawn; each
# sample (enrolment day) contributes one line of windowed mean coverage. The
# cell also
#   * stores, per sample and segment, the fraction of shared polymorphic sites
#     (all_polymorphic_sites) covered at >= min_cov as a new meta_df column
#     named after the segment, and
#   * collects all windowed means in overall_gene_coverage_distribution
#     (indexed by gene and window end position) for the overall plot.
# Reads ./results/map_nuc_results_{sampid}.csv (columns used: Gene, Position,
# Coverage); writes ./results/figures/coverage_plots_{subject_id}.pdf.

# standardise maximum y-value for plots: largest per-site coverage over all
# samples, rounded up to the next power of ten
ymax = -1
for sampid in sorted_sampid:
    nuc_results_fpath = './results/map_nuc_results_{}.csv'.format(sampid)
    if not os.path.isfile(nuc_results_fpath):
        # no results file for this sample (reported as "No mapped reads" below)
        continue
    ymax = max(ymax, pd.read_csv(nuc_results_fpath, usecols=['Coverage'])['Coverage'].max())
# clamp so that missing/empty results cannot lead to log10 of a non-positive
# number, and so that the y-range (1, ymax) is never degenerate
ymax = 10**max(1, int(np.ceil(np.log10(max(ymax, 1)))))

# average coverage is based on a sliding window of 50 bp with stepsize of 25 bp
sliding_window = 50
stepsize = 25
label_size = 12
color_scheme = ["#9e0142", "#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#66c2a5", "#3288bd", "#5e4fa2"][::-1]

# gene segments in panel order, and array of their lengths
sorted_genes = sorted(influenza_gene_len.keys())
sorted_gene_len = np.array([influenza_gene_len[refname] for refname in sorted_genes])

# shared polymorphic sites per segment as sets; .loc[[refname], ...] always
# returns a Series, and segments without any shared site get an empty set
polymorphic_nucpos = {
    refname: (set(all_polymorphic_sites.loc[[refname], 'nucpos'])
              if refname in all_polymorphic_sites.index else set())
    for refname in sorted_genes
}

# reindex meta_df based on subject_id and enrollment day
meta_df = meta_df.reset_index().set_index(['subject_id', 'enrolD']).sort_index()

# records used to plot overall distribution across all patients
overall_gene_coverage_distribution = []

# make sure the output folder exists before the first savefig
os.makedirs('./results/figures', exist_ok=True)

for subject_id in sorted(set(meta_df.index.get_level_values(0))):
    print (subject_id)

    subject_meta_df = meta_df.loc[subject_id]

    # initialise coverage plot figure for each subject
    with plt.style.context("default"):
        fig = plt.figure(figsize=(11.7, 4.1))
        spec = gridspec.GridSpec(1, len(sorted_genes), figure=fig, wspace=0.2,
                                 width_ratios=sorted_gene_len/np.sum(sorted_gene_len))

        # list of subplots (by segments); filled while the first sample with
        # results is plotted and reused by the samples that follow
        axes = []

        for enrolD in sorted(set(subject_meta_df.index)):
            enrolD_subject_meta_df = subject_meta_df.loc[enrolD]

            # label by day ("D<timepoint>"); fall back to the enrolment day
            # when the timepoint is missing or not a single number
            try:
                timepoint_label = "D%i"%(int(enrolD_subject_meta_df['timepoint']))
            except (TypeError, ValueError):
                timepoint_label = "T%i"%(enrolD)

            sampid = enrolD_subject_meta_df['sampid']
            sample_type = enrolD_subject_meta_df['SampleType']

            # read map_nuc_results
            nuc_results_fpath = './results/map_nuc_results_{}.csv'.format(sampid)
            if not os.path.isfile(nuc_results_fpath):
                print ('No mapped reads found for %s'%sampid)
                continue
            # keep_default_na=False: do not turn strings such as "NA" into NaN
            map_nuc_results = pd.read_csv(nuc_results_fpath, keep_default_na=False)

            for _r, refname in enumerate(sorted_genes):

                rdf = map_nuc_results[map_nuc_results['Gene']==refname]

                refseq_len = influenza_gene_len[refname]
                gene_start_pos = 1
                gene_end_pos = gene_start_pos+refseq_len

                # window end positions: every stepsize bp, always closing with
                # the position just past the end of the segment
                x_values = np.arange(gene_start_pos, gene_end_pos, stepsize)
                if x_values[-1] != gene_end_pos:
                    x_values = np.append(x_values, gene_end_pos)

                # mean coverage over the (up to) sliding_window positions
                # preceding each window end; positions without an entry in the
                # results count as zero coverage
                y_values = np.zeros(len(x_values)-1)
                coverage = rdf.set_index('Position')['Coverage'].sort_index()

                for idx in range(1, len(x_values)):
                    x_val = x_values[idx]
                    # positions are 1-based: never start a window before
                    # gene_start_pos
                    window_start = max(gene_start_pos, x_val-sliding_window)
                    in_window = (coverage.index >= window_start) & (coverage.index < x_val)
                    mean_coverage = coverage[in_window].sum()/float(x_val-window_start)

                    y_values[idx-1] = mean_coverage

                    # store computed mean coverage
                    overall_gene_coverage_distribution.append({'gene':refname, 'pos':x_val, 'coverage':mean_coverage})

                # breadth of coverage: fraction of the shared polymorphic
                # sites of this segment that reach min_cov in this sample
                # (NaN when the segment has no shared polymorphic site)
                segment_sites = polymorphic_nucpos[refname]
                covered_sites = segment_sites & set(rdf.loc[rdf['Coverage']>=min_cov, 'Position'])
                if segment_sites:
                    meta_df.loc[(subject_id, enrolD), refname] = len(covered_sites)/len(segment_sites)
                else:
                    meta_df.loc[(subject_id, enrolD), refname] = np.nan

                if len(axes) <= _r:
                    # first sample with results: create the panel
                    ax = fig.add_subplot(spec[0,_r])
                    ax.set_title(refname, fontsize=label_size) # title

                    # plot min_cov line
                    ax.plot(x_values[1:],
                            np.zeros(len(x_values)-1)+min_cov,
                            color='k', linestyle='--')
                    axes.append(ax)
                else:
                    ax = axes[_r]

                # plot coverage; patch samples are drawn with dashed lines.
                # The modulo keeps enrolD 0 on the last colour (as index -1
                # did) and avoids an IndexError for days beyond the palette.
                line_color = color_scheme[(enrolD-1) % len(color_scheme)]
                if re.search('_P$', sampid):
                    label = '{}-{} (patch)'.format(timepoint_label, sample_type)
                    ax.plot(x_values[1:], y_values, '--', color=line_color, label=label)
                else:
                    label = '{}-{}'.format(timepoint_label, sample_type)
                    ax.plot(x_values[1:], y_values, color=line_color, label=label)

        if not axes:
            # none of this subject's samples had results: nothing to draw
            plt.close(fig)
            continue

        for _ax, ax in enumerate(axes):
            if _ax == 0:
                ax.set_ylabel('Coverage')
                ax.yaxis.label.set_fontsize(label_size)
            else:
                # remove y-axis label (sharey)
                ax.tick_params(labelleft=False)

            # gray facecolor for odd panels
            if (_ax%2 != 0):
                ax.set_facecolor(color='#d1d1d1')

            # remove left and right spines
            ax.spines['left'].set_visible(False)
            ax.spines['right'].set_visible(False)

            # x ticks from first to last position; more ticks on longer segments
            refseq_len = influenza_gene_len[sorted_genes[_ax]]
            if refseq_len > 2000:
                num_ticks = 4
            elif refseq_len > 1000:
                num_ticks = 3
            else:
                num_ticks = 2
            tick_positions = np.linspace(1, refseq_len, num_ticks)
            ax.set_xticks(tick_positions)
            ax.set_xticklabels([int(tick) for tick in tick_positions])

            # set ylim and yscale
            ax.set_ylim((1, ymax))
            ax.set_yscale('symlog')

            # change tick size
            ax.tick_params(axis='both', which='major', labelsize=label_size*0.8)

            # change axis size
            ax.xaxis.label.set_fontsize(label_size)

        # x-axis label
        fig.text(0.5, 0.01, 'Position', ha='center', fontsize=label_size)
        # legend hangs off the right-most (last created) panel
        axes[-1].legend(loc='center left',  bbox_to_anchor=(1, 0.5))
        plt.savefig('./results/figures/coverage_plots_{}.pdf'.format(subject_id),
                    bbox_inches='tight', pad_inches=0.)
        plt.show()
        # release the figure: one is created per subject
        plt.close(fig)

# convert overall_gene_coverage_distribution to dataframe
overall_gene_coverage_distribution = pd.DataFrame(overall_gene_coverage_distribution,
                                                  columns=['gene', 'pos', 'coverage'])
overall_gene_coverage_distribution = overall_gene_coverage_distribution.set_index(['gene', 'pos']).sort_index()

# Plot overall coverage across all samples

In [27]:
# Overall coverage plot: one panel per gene segment showing, across all
# samples, the median windowed coverage (black line), the inter-quartile range
# (darker band) and the min-max range (lighter band), plus the min_cov line.
# Uses overall_gene_coverage_distribution, label_size and ymax from the
# per-subject coverage cell. Writes ./results/figures/coverage_plots_overall.pdf
# and ./results/metadata_w_covbreadth.csv.

# make sure the output folder exists before savefig
os.makedirs('./results/figures', exist_ok=True)

# initialise coverage plot figure
with plt.style.context("default"):
    # One ordering drives panel widths, panel data and tick ranges, so the
    # three can never disagree. It is the ordering under which the coverage
    # distribution was collected.
    gene_order = sorted(influenza_gene_len.keys())
    panel_widths = np.array([influenza_gene_len[refname] for refname in gene_order], dtype=float)

    fig = plt.figure(figsize=(11.7, 4.1))
    spec = gridspec.GridSpec(1, len(gene_order), figure=fig, wspace=0.2,
                             width_ratios=panel_widths/np.sum(panel_widths))

    for _r, refname in enumerate(gene_order):
        ax = fig.add_subplot(spec[0,_r])
        ax.set_title(refname, fontsize=label_size) # title

        # Summarise coverage per window position across samples. Grouping
        # works with any number of samples per position (including one),
        # unlike stacking the per-position values into a 2-D array.
        coverage_by_pos = overall_gene_coverage_distribution.loc[refname].groupby(level=0)['coverage']
        median_coverage = coverage_by_pos.median()
        X_array = median_coverage.index.values

        # plot min_cov line
        ax.plot(X_array, [min_cov]*len(X_array), "--", color='#fcae91')

        ax.plot(X_array, median_coverage.values, color='#000000')
        ax.fill_between(X_array, coverage_by_pos.quantile(0.25).values,
                        coverage_by_pos.quantile(0.75).values, facecolor='#ef3b2c', alpha=0.5)
        ax.fill_between(X_array, coverage_by_pos.min().values,
                        coverage_by_pos.max().values, facecolor='#fcbba1', alpha=0.2)

        if _r == 0:
            ax.set_ylabel('Coverage')
            ax.yaxis.label.set_fontsize(label_size)
        else:
            # remove y-axis label (sharey)
            ax.tick_params(labelleft=False)

        # gray facecolor for odd panels
        if (_r%2 != 0):
            ax.set_facecolor(color='#d1d1d1')

        # remove left and right spines
        ax.spines['left'].set_visible(False)
        ax.spines['right'].set_visible(False)

        # x ticks from first to last position; more ticks on longer segments
        refseq_len = influenza_gene_len[refname]
        if refseq_len > 2000:
            num_ticks = 4
        elif refseq_len > 1000:
            num_ticks = 3
        else:
            num_ticks = 2
        tick_positions = np.linspace(1, refseq_len, num_ticks)
        ax.set_xticks(tick_positions)
        ax.set_xticklabels([int(tick) for tick in tick_positions])

        # set ylim and yscale
        ax.set_ylim((1, ymax))
        ax.set_yscale('symlog')

        # change tick size
        ax.tick_params(axis='both', which='major', labelsize=label_size*0.8)

        # change axis size
        ax.xaxis.label.set_fontsize(label_size)

    # x-axis label
    fig.text(0.5, 0.01, 'Position', ha='center', fontsize=label_size)
    plt.savefig('./results/figures/coverage_plots_overall.pdf',
                bbox_inches='tight', pad_inches=0.)
    plt.show()
    plt.close(fig)

# save meta_df with coverage breadth to results (independent of the plot style
# context, so kept outside the `with` block)
meta_df.to_csv('./results/metadata_w_covbreadth.csv')