-
Notifications
You must be signed in to change notification settings - Fork 0
/
matrix.py
executable file
·77 lines (64 loc) · 2.04 KB
/
matrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import numpy as np
import pandas as pd
def create_matrix(file,best_k):
fastq_filehandle = open(file, "r")
# Start with an empty dictionary
counts = {}
seq=0
# Loop over each line in the file
line = ''
seq_id="seq_{}".format(seq)
for row in fastq_filehandle:
# Keep the rows with data
if ">" not in row:
line = line + row.strip()
else:
if seq != 0:
counts =count_kmers(seq_id,line,best_k,counts)
seq=seq+1
seq_id="seq_{}".format(seq)
counts[seq_id]={}
line = ''
counts =count_kmers(seq_id,line,best_k,counts)
matrix_data = pd.DataFrame(counts)
fastq_filehandle.close
# Create info file
info = open("results/matrix-{}.txt".format(best_k),'a')
info.write(matrix_data.to_string(na_rep='0'))
info.close()
return
def count_kmers(id,read,k,counts):
"""Count kmer occurrences in a given sequence.
Parameters
----------
id: number of sequence.
read : string
A single DNA sequence.
k : int
The value of k for which to count kmers.
counts : dictionary, {'string': int}
A dictionary of counts keyed by their individual kmers (strings
of length k).
Returns
-------
counts : dictionary, {'string': int}
A dictionary of counts keyed by their individual kmers (strings
of length k).
Examples
--------
>>> count_kmers("GATGAT", 3)
{'ATG': 1, 'GAT': 2, 'TGA': 1}
"""
# Calculate how many kmers of length k there are
num_kmers = len(read) - k + 1
# Loop over the kmer start positions
for i in range(num_kmers):
# Slice the string to get the kmer
kmer = read[i:i+k]
# Add the kmer to the dictionary if it's not there
if kmer not in counts[id]:
counts[id][kmer] = int(0)
# Increment the count for this kmer
counts[id][kmer] =int(counts[id][kmer])+ int(1)
# Return the final counts
return counts