# biopython
- [url](https://github.com/biopython/biopython)
- FASTQ
- FASTA
- SAM
- BAM
- VCF
- BED

## FASTQ

In [1]:
def _opener(filename):
    if filename.endswith('.gz'):
        import gzip
        return gzip.open
    elif filename.endswith('.bz2'):
        import bz2
        return bz2.open
    elif filename.endswith('.lzma'):
        import lzma
        return lzma.open
    else:
        return open
    

In [3]:
# FASTQ by Biopython

from Bio.SeqIO.QualityIO import FastqPhredIterator, FastqPhredWriter
from Bio.SeqRecord import SeqRecord

def head(fpath, h = 10) :
    openfun = _opener(fpath)
    if openfun is not None :
        with openfun(fpath) as fh:
            try :
                it = FastqPhredIterator(fh)
                for i, r in enumerate(it) :
                    if i > 10:
                        break
                    else:
                        print("i={}, r={}".format(i,r))
                              
            except Exception as e:
                    print("Error : {}".format(str(e)))
                    
                    
                              
def gen(fpath, num = 100):
    openfun = _opener(fpath)
    if openfun is not None :
        with openfun(fpath) as fh:
            it = FastqPhredIterator(fh)
            for i, r in enumerate(it) :
                if i < num :
                    yield  r
                else :
                    break
        
            
def sample(src, frac = 0.1) :
    import random
    percent = int(frac * 100)
    for r in src:
        if random.randrange(1,101) < percent:
            yield r
"""
    - "qual" means simple quality files using PHRED scores (e.g. from Roche 454)
    - "fastq" means Sanger style FASTQ files using PHRED scores and an ASCII
      offset of 33 (e.g. from the NCBI Short Read Archive and Illumina 1.8+).
      These can potentially hold PHRED scores from 0 to 93.
    - "fastq-sanger" is an alias for "fastq".
    - "fastq-solexa" means old Solexa (and also very early Illumina) style FASTQ
      files, using Solexa scores with an ASCII offset 64. These can hold Solexa
      scores from -5 to 62.
    - "fastq-illumina" means newer Illumina 1.3 to 1.7 style FASTQ files, using
      PHRED scores but with an ASCII offset 64, allowing PHRED scores from 0
      to 62.
      
>>> from Bio import SeqIO
>>> for record in SeqIO.parse("Quality/example.fastq", "fastq"):
...     print("%s %s" % (record.id, record.seq))

"""
def output(fpath, src) :
    with _opener(fpath)(fpath, "w") as out :
        writer = FastqPhredWriter(out)
        writer.write_header()
    
        for r in src:
            #print(r)
            writer.write_record(r)
            
def stat(it) :
    import collections
    
    index = collections.defaultdict(int)
    for r in it:
        index[r.seq[0:4]]+=1
    print("size={}".format(len(index)))
    for k in index.keys() :
        print("key={},v={}".format(k,index[k]))
        
    
src = gen("SRR3503015.fastq", 1000)
stat(src)
#output("srr3503015_sample.fastq", src)

#head("SRR3503015.fastq")

size=218
key=ATTC,v=3
key=TGCG,v=3
key=TGGC,v=7
key=TGCC,v=7
key=CCTG,v=19
key=TCCA,v=9
key=CCTT,v=10
key=GGTG,v=14
key=CAGT,v=8
key=ACAC,v=4
key=TGAC,v=2
key=GTTT,v=5
key=CTGA,v=3
key=TCTT,v=6
key=ACAA,v=6
key=CCCA,v=23
key=TGTA,v=7
key=CTGC,v=5
key=TCCG,v=3
key=CCAA,v=12
key=ACTA,v=1
key=TCAA,v=8
key=CCCT,v=21
key=CCCG,v=4
key=TGGA,v=7
key=CTAC,v=3
key=TAAG,v=5
key=CATT,v=3
key=GTCT,v=2
key=TTGT,v=3
key=ACCT,v=6
key=ACAG,v=5
key=GCAC,v=6
key=CACT,v=5
key=GCAT,v=4
key=CAAG,v=9
key=GGCT,v=9
key=TACC,v=6
key=CACA,v=8
key=ACTT,v=5
key=TGAG,v=10
key=ACTG,v=5
key=CCAG,v=28
key=CAGA,v=6
key=CCAC,v=11
key=CTTG,v=5
key=ATTT,v=3
key=TGTT,v=12
key=CCGT,v=2
key=TTCA,v=3
key=GGTT,v=6
key=ATCT,v=1
key=CGGC,v=5
key=CTTT,v=11
key=TCTG,v=7
key=AGAA,v=10
key=TACT,v=3
key=TAGG,v=8
key=GCCT,v=10
key=AATT,v=4
key=AGCA,v=3
key=TGTC,v=7
key=TCTA,v=2
key=TCAT,v=7
key=GCAA,v=5
key=CAAA,v=4
key=AACA,v=3
key=GCAG,v=8
key=CCTC,v=20
key=CTAT,v=7
key=TGAT,v=12
key=CGAG,v=1
key=GGAA,v=10
key=ACAT,v=4
key=CAGC,v=5




In [60]:
from Bio import SeqIO
f = SeqIO.parse("SRR3503015.fastq", "fastq")
r = next(f)
print(r)
ano = r.letter_annotations
print(r.letter_annotations["phred_quality"])
for i in ano:
    print(i)

ID: SRR3503015.1
Name: SRR3503015.1
Description: SRR3503015.1 1 length=194
Number of features: 0
Per letter annotation for: phred_quality
Seq('ATTCATTCAAGAAATGTTAAACATGCAAATTTTTAGCACATGTATATGTAATAT...ATT', SingleLetterAlphabet())
[25, 35, 35, 37, 39, 39, 39, 39, 39, 41, 40, 40, 41, 40, 39, 41, 37, 34, 40, 39, 39, 40, 41, 34, 37, 39, 39, 40, 40, 40, 39, 40, 36, 38, 38, 37, 39, 39, 40, 40, 38, 40, 38, 35, 39, 40, 41, 40, 37, 37, 38, 38, 40, 29, 37, 39, 38, 39, 39, 36, 39, 39, 40, 37, 37, 39, 40, 37, 39, 37, 37, 34, 34, 39, 39, 40, 40, 35, 36, 39, 38, 40, 40, 41, 39, 39, 38, 37, 34, 36, 37, 33, 35, 36, 37, 37, 37, 32, 35, 35, 35, 39, 39, 39, 39, 39, 40, 37, 39, 39, 39, 38, 40, 38, 40, 39, 39, 37, 39, 40, 38, 39, 40, 41, 39, 39, 39, 38, 40, 41, 41, 40, 38, 35, 34, 40, 37, 39, 40, 40, 36, 39, 40, 35, 37, 39, 40, 36, 39, 38, 39, 40, 40, 40, 40, 35, 31, 37, 39, 38, 38, 39, 40, 38, 38, 38, 40, 40, 40, 41, 41, 41, 41, 34, 38, 39, 40, 39, 36, 34, 39, 39, 38, 39, 37, 37, 37, 37, 35, 31, 36, 34, 3

In [1]:
import biopython

ModuleNotFoundError: No module named 'biopython'

In [82]:
! cat srr3503015_sample.fastq

@SRR3503015.1 1 length=194
ATTCATTCAAGAAATGTTAAACATGCAAATTTTTAGCACATGTATATGTAATATTCACTTTACTACATATTGCAATGTACTTGAAGCTTTTATATTAGAAGACTTAATAATGAATGCTAATACCTTTCCTATATGAAGGATTTTTCATGATAGACATTGAAATATGAAATTTTGATGGTACTATTATTCCCATT
+
:DDFHHHHHJIIJIHJFCIHHIJCFHHIIIHIEGGFHHIIGIGDHIJIFFGGI>FHGHHEHHIFFHIFHFFCCHHIIDEHGIIJHHGFCEFBDEFFFADDDHHHHHIFHHHGIGIHHFHIGHIJHHHGIJJIGDCIFHIIEHIDFHIEHGHIIIID@FHGGHIGGGIIIJJJJCGHIHECHHGHFFFFD@ECEE
@SRR3503015.2 2 length=194
TGCGCCCTGCAGCGGCAGCAGGCGACGGCAGCCGAGAAGCCCTGGGATGCCAAGGCCCCCGAGGGCCGGCTGCGGAAGTGTGAGAGCACCGACTCGGCCCGGTCTTCCGGGGCCGCACGTTGTCCAGCTGGGCATCGTCCACCACCGCCTGGTTGTGGGAGATGAGCTGGGCGATGCGCTCCTCCAGCCGCTTC
+
DDEDHHHHHIJJJJJIJJJIHIGGGJHGEDDCC>B?BD<CBDCBDDDDDDDDDBDDDBBB;775BDB9BB><@<B<@B:A:ACD@C<AAA<<><@8?BDFFHHHHHJJHIJJJJIJJJJHIIJJJJHJJHHHHFEFFCCDDDDBBDBDD;BD:AD7?BDBD;ACCDDCDDDBBB<>BDB>BDCCD<ACDDD599
@SRR3503015.3 3 length=194
TGGCAGTCATAGTATGGTTCTTCTACCTGAATCTCTTGATCTTCACCTTCTTCTGGGTCTTCAATTCCCACACCGTCAGGCTCGTCGGCATCTCCCTGGCCGAGCAGCCAAATGGGGATGCTGATG