In [None]:
# Analysis settings; both values are forwarded unchanged to every
# per-sample notebook run further down.
# Gene-coverage threshold; reported in the summary as a percentage.
threshold = 50
# Passed through as the `fastp` parameter of the per-sample notebook.
# NOTE(review): presumably a 0/1 switch for the quality-inspection step -- confirm.
fastp = 0

### Date of Analysis

In [None]:
import datetime

# Today's local date as YYYYMMDD; used later as a suffix in output file names.
now = "{:%Y%m%d}".format(datetime.datetime.now())
print("Date subfix: " + now)

### Samples subjected to Shigella serotyping analysis:

In [None]:
# Identify the sample names for the paired-end reads and their size on disk
import glob
import os
import pandas as pd

FASTQ_SUFFIX = ".fastq.gz"


def sample_name(filename):
    """Return the sample name encoded in a FASTQ file name.

    The sample name is everything before the first underscore. A name that
    contains no underscore falls back to the file name minus the
    ``.fastq.gz`` suffix; slicing with ``str.find`` would otherwise return -1
    and silently chop off the last character of the name.
    """
    prefix, underscore, _ = filename.partition("_")
    if underscore:
        return prefix
    if filename.endswith(FASTQ_SUFFIX):
        return filename[:-len(FASTQ_SUFFIX)]
    return filename


# Sorted so that results do not depend on the arbitrary order glob returns.
files = sorted(glob.glob("*" + FASTQ_SUFFIX))
names = [sample_name(file) for file in files]

# Total bytes on disk per sample, accumulated in a single pass over the files.
bytes_per_sample = {}
for file, name in zip(files, names):
    bytes_per_sample[name] = bytes_per_sample.get(name, 0) + os.path.getsize(file)

samples = sorted(bytes_per_sample)
# NOTE(review): the divisor 1024000 is kept from the original so reported sizes
# stay the same; it is neither 10**6 (MB) nor 2**20 (MiB) -- confirm the intended unit.
sizes = [round(bytes_per_sample[name] / 1024000, 1) for name in samples]

sizetable = pd.DataFrame({'Sample': samples, 'Size (MB)': sizes})

In [None]:
# 
ShigellaRef = "../../references/ShigellaRef5.fasta"
if os.path.isfile(ShigellaRef) == False:
    print("Error: reference sequence database does not exist!")
    exit()
# generate a index file for reference
dir_path = os.path.dirname(os.path.realpath(ShigellaRef))
rel_dir = os.path.relpath(dir_path, os.getcwd())
mmi_index = os.path.join(rel_dir, "ShigellaRef5.mmi")
if os.path.isfile(mmi_index) == False:
    print("building Reference sequence index.......")
    !minimap2 -d $mmi_index $ShigellaRef
# another index needed to generate
fai_index = os.path.join(rel_dir, "ShigellaRef5.fasta.fai")
if os.path.isfile(fai_index) == False:
    !samtools faidx $ShigellaRef

In [None]:
import datetime
now = datetime.datetime.now().strftime("%Y%m%d")
import papermill as pm
outputs = []
for Sample in samples:
    reads = []
    for file in files:
        if file.startswith(Sample+"_"):
            reads.append(file)
    if len(reads) == 2:
        output = Sample + "_" + str(threshold) + "_" + now + ".ipynb"
        outputs.append(output); print(" ")
        pm.execute_notebook('batch_102618.ipynb', output,
                            dict(Sample=Sample, read1 = reads[0], read2 = reads[1], fastp = fastp, threshold = threshold)) 
        !jupyter nbconvert --to html $output
    else: print('\n'+Sample + " does not have pair-end reads! Please re-examine.")

### Summary of serotype prediction results:

In [None]:
from IPython.display import display, HTML

print("Date of analysis:", now)
print("Threshold level for gene coverage: ", threshold, "%")
print(len(outputs), " samples were analyzed: \n")

# Results are collected under their own name: reassigning `samples` here would
# replace the discovered-sample list that the run cell iterates over, so
# re-running that cell afterwards would silently use the wrong list.
analyzed_samples = []
predictions = []
for output in outputs:
    nb = pm.read_notebook(output)
    # NOTE(review): rows 0 and 5 are picked by position; presumably they are
    # the sample name and the serotype prediction recorded by the per-sample
    # notebook -- confirm against batch_102618.ipynb.
    analyzed_samples.append(nb.dataframe.iloc[0]['value'])
    predictions.append(nb.dataframe.iloc[5]['value'])

table = pd.DataFrame({'Sample': analyzed_samples, 'Prediction': predictions})
# Left join keeps every row of the size table, including samples for which
# no prediction was produced.
final = pd.merge(sizetable, table, on="Sample", how='left')
display(HTML(final.to_html(index=False)))
#print(nb.dataframe.iloc[3, 1], '\n')

In [None]:
from IPython.display import HTML

# Script plus notice rendered into the notebook output. The script hides every
# `div.input` element once the document is ready, and the link in the notice
# calls code_toggle() to show or hide them again.
TOGGLE_MARKUP = '''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
The raw code for this notebook is by default hidden for easier reading.
To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.'''

# Must stay the last expression of the cell so that the notebook renders it.
HTML(TOGGLE_MARKUP)