In [1]:
import pandas as pd

## Assemble PacBio reads with Canu:

```canu -p ecoli -d ecoli-pacbio4 stopOnLowCoverage=6 genomeSize=4.8m useGrid=false -pacbio-raw ../../NGS/6/pacbio_10x.fq.gz ```

## Assemble Illumina reads with SPAdes:

```spades.py -k 21,33,55,77 --careful --pe1-1 ../../NGS/6/illumina.100x.1.fq.gz --pe1-2 ../../NGS/6/illumina.100x.2.fq.gz -o illumina_assembly```

# QUAST assembly evaluation

In [2]:
# Load the tab-separated QUAST summary table: the `Assembly` column holds the
# metric name and every other column holds that metric for one assembly.
report_path = 'quast_results/latest/report.tsv'
quast_report = pd.read_csv(report_path, sep='\t')

## Basic statistics

### Contig sizes

In [6]:
# Contig count and size metrics for every assembly.
# Rows are selected with a boolean mask rather than `.where(mask).dropna()`:
# the latter blanks non-matching rows to NaN (which can upcast numeric columns)
# and then drops *any* row containing NaN, so a requested metric with a missing
# value for one assembly would silently vanish from the table.
contig_size_metrics = [
    '# contigs',
    '# contigs (>= 1000 bp)',
    'Largest contig',
    'Total length',
]
quast_report[quast_report.Assembly.isin(contig_size_metrics)]

Unnamed: 0,Assembly,pacbio10x,pacbio20x,pacbio40x,pacbio80x,illuminaK21,illuminaK33,illuminaK55,illuminaK77
1,# contigs (>= 1000 bp),132,24,3,1,593,306,133,73
12,# contigs,132,24,3,1,693,328,147,81
13,Largest contig,180610,749388,4640801,4652986,44931,81459,187502,315011
14,Total length,3817958,4606872,4656609,4652986,4470814,4514267,4537483,4561340


![Cumulative length plot](pics/cumulative_plot.png)

### Nx and Lx

In [7]:
# Nx / Lx contiguity metrics for every assembly.
# Boolean-mask selection is used instead of `.where(mask).dropna()`, which
# would NaN-fill the other rows (possibly upcasting numeric columns) and drop
# any selected row that has a missing value.
nx_lx_metrics = ['N50', 'N75', 'L50', 'L75']
quast_report[quast_report.Assembly.isin(nx_lx_metrics)]

Unnamed: 0,Assembly,pacbio10x,pacbio20x,pacbio40x,pacbio80x,illuminaK21,illuminaK33,illuminaK55,illuminaK77
18,N50,47884,273158,4640801,4652986,10524,25441,61808,133059
20,N75,26677,199297,4640801,4652986,6493,12671,39099,66190
22,L50,27,5,1,1,128,56,21,12
24,L75,52,10,1,1,267,120,43,24


![Nx plot](pics/Nx_plot.png)

### GC content and Ns

In [10]:
# GC content and ambiguous-base (N) rate for every assembly.
# Boolean-mask selection is used instead of `.where(mask).dropna()`, which
# would NaN-fill the other rows (possibly upcasting numeric columns) and drop
# any selected row that has a missing value.
gc_and_n_metrics = ['GC (%)', "# N's per 100 kbp"]
quast_report[quast_report.Assembly.isin(gc_and_n_metrics)]

Unnamed: 0,Assembly,pacbio10x,pacbio20x,pacbio40x,pacbio80x,illuminaK21,illuminaK33,illuminaK55,illuminaK77
16,GC (%),50.5,50.76,50.76,50.79,50.72,50.75,50.75,50.74
37,# N's per 100 kbp,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### _По базовым статистикам лучшей оказалась сборка прочтений PacBio с максимальным покрытием (80x): она собралась в один контиг_

## Reference-based metrics

### NGx and LGx

In [11]:
# Reference-based NGx / LGx contiguity metrics for every assembly.
# Boolean-mask selection is used instead of `.where(mask).dropna()`, which
# would NaN-fill the other rows (possibly upcasting numeric columns) and drop
# any selected row that has a missing value.
ngx_lgx_metrics = ['NG50', 'NG75', 'LG50', 'LG75']
quast_report[quast_report.Assembly.isin(ngx_lgx_metrics)]

Unnamed: 0,Assembly,pacbio10x,pacbio20x,pacbio40x,pacbio80x,illuminaK21,illuminaK33,illuminaK55,illuminaK77
19,NG50,42731,273158,4640801,4652986,10092,24744,61808,133059
21,NG75,12082,199297,4640801,4652986,5762,11450,35764,64443
23,LG50,36,5,1,1,136,58,21,12
25,LG75,85,10,1,1,288,128,45,25


### Alignment statistics

In [14]:
# Alignment-to-reference statistics for every assembly.
# Boolean-mask selection is used instead of `.where(mask).dropna()`, which
# would NaN-fill the other rows (possibly upcasting numeric columns) and drop
# any selected row that has a missing value.
# Rows come back in report order, not in the order listed here.
alignment_metrics = [
    'Genome fraction (%)',
    'Total length',
    'Reference length',
    'Duplication ratio',
    'Largest alignment',
    '# mismatches per 100 kbp',
    'Total aligned length',
    '# indels per 100 kbp',
]
quast_report[quast_report.Assembly.isin(alignment_metrics)]

Unnamed: 0,Assembly,pacbio10x,pacbio20x,pacbio40x,pacbio80x,illuminaK21,illuminaK33,illuminaK55,illuminaK77
14,Total length,3817958.0,4606872.0,4656609.0,4652986.0,4470814.0,4514267.0,4537483.0,4561340.0
15,Reference length,4639675.0,4639675.0,4639675.0,4639675.0,4639675.0,4639675.0,4639675.0,4639675.0
35,Genome fraction (%),82.135,98.615,99.994,99.998,96.227,97.215,97.743,98.271
36,Duplication ratio,1.002,1.007,1.004,1.003,1.001,1.001,1.0,1.0
38,# mismatches per 100 kbp,48.81,6.05,1.23,0.39,0.09,0.07,0.11,1.16
39,# indels per 100 kbp,677.23,151.42,34.16,11.29,0.07,0.07,0.11,0.37
40,Largest alignment,149583.0,514798.0,3022595.0,3026092.0,44931.0,81428.0,187502.0,315011.0
41,Total aligned length,3817753.0,4606854.0,4653329.0,4652984.0,4470100.0,4513480.0,4536582.0,4560081.0


### Misassemblies

In [15]:
# Misassembly counts and affected contig lengths for every assembly.
# Boolean-mask selection is used instead of `.where(mask).dropna()`, which
# would NaN-fill the other rows (possibly upcasting numeric columns) and drop
# any selected row that has a missing value.
misassembly_metrics = [
    '# misassemblies',
    '# misassembled contigs',
    'Misassembled contigs length',
    '# local misassemblies',
]
quast_report[quast_report.Assembly.isin(misassembly_metrics)]

Unnamed: 0,Assembly,pacbio10x,pacbio20x,pacbio40x,pacbio80x,illuminaK21,illuminaK33,illuminaK55,illuminaK77
26,# misassemblies,11,8,8,8,0,0,0,0
27,# misassembled contigs,7,3,1,1,0,0,0,0
28,Misassembled contigs length,345889,1187160,4640801,4652986,0,0,0,0
29,# local misassemblies,1,2,2,2,0,0,4,9


### _По статистикам, основанным на референсе, у сборок прочтений Illumina оказалось меньше ошибок выравнивания (замен и инделов), но и доля покрытого генома у них меньше. По misassemblies (ошибкам сборки) сборки Illumina тоже оказались лучше, но сравнивать длины и количества ошибочно собранных контигов, по-видимому, некорректно, учитывая, что из прочтений PacBio с хорошим покрытием получилось всего 1–3 контига_