Initial look at ORF redundancy, to decide whether to cluster ORFs before running mica.

In [1]:
import pandas as pd
import os.path as op
import os
import shutil
from collections import Counter, defaultdict
from itertools import groupby

def read_fasta(file_handle):
    '''Iterate over the records of a FASTA file.

    Args:
        file_handle: any iterable of text lines (e.g. an open file).

    Yields:
        (name, seq) tuples. `name` is the header line without the leading
        '>' and without surrounding whitespace; `seq` is the concatenation
        of the following sequence lines, each stripped of whitespace.

    Notes:
        * Lines that appear before the first header are skipped instead of
          raising an unbound-variable error.
        * When several header lines appear in a row, the sequence that
          follows is attributed to the last of them; headers that have no
          sequence lines are not yielded.
    '''
    name = None
    # startswith() is safe on empty strings, unlike line[0]
    for header, group in groupby(file_handle, lambda line: line.startswith('>')):
        if header:
            # Consecutive headers land in one group; keep the last one,
            # since that is the header the upcoming sequence belongs to.
            for line in group:
                name = line[1:].strip()
        elif name is not None:
            seq = ''.join(line.strip() for line in group)
            yield name, seq

In [17]:
# Make the annotation directory the working directory; relative paths in
# cells run after this one (e.g. 'mash_d90', 'faa') are resolved against it.
os.chdir("/mnt/scgc/simon/simonsproject/bats248_annotations")

In [12]:
# Group ORF names by their exact sequence so identical ORFs can be collapsed.
seqs = set()                 # not used by any cell visible in this notebook
seqdict = defaultdict(list)  # sequence -> list of ORF names sharing it
count = 0                    # total number of FASTA records read

# Open the FASTA in a `with` block so the handle is closed after reading.
with open("../bats248_all_orfs.faa") as fasta_handle:
    for name, seq in read_fasta(fasta_handle):
        count += 1
        seqdict[seq].append(name)

In [13]:
# Total number of ORF records read from the FASTA file
count

4374770

In [14]:
# Number of distinct sequences (one seqdict key per unique sequence)
len(seqdict)

2862864

In [16]:
# Fraction of ORF records that would be dropped by keeping a single copy
# of each distinct sequence
(count - len(seqdict)) / count

0.3455966827970385

In [19]:
# Write one FASTA record per distinct sequence. The header is the
# "__"-joined list of every ORF name that shares the sequence, and the
# sequence is wrapped at 60 characters per line.
with open("bats248_orfs_nonidentical.faa", "w") as oh:
    for seq, members in seqdict.items():
        oh.write(">{}".format("__".join(members)) + "\n")
        wrapped = (seq[start:start + 60] for start in range(0, len(seq), 60))
        oh.writelines(chunk + "\n" for chunk in wrapped)

In [20]:
# Table of MASH cluster categories; the cells below use its 'sample' and
# 'd90_group' columns.
df = pd.read_csv("/mnt/scgc/simon/simonsproject/mash/171206_MASH_cluster_categories.csv")

In [22]:
# Number of rows per d90_group value, largest group first
Counter(df['d90_group']).most_common()

[(3, 517),
 (9, 274),
 (5, 240),
 (15, 227),
 (16, 217),
 (33, 185),
 (11, 133),
 (24, 130),
 (35, 122),
 (34, 109),
 (13, 102),
 (27, 100),
 (6, 90),
 (81, 87),
 (10, 83),
 (67, 80),
 (44, 71),
 (31, 58),
 (75, 54),
 (18, 52),
 (58, 50),
 (4, 48),
 (8, 48),
 (12, 46),
 (55, 46),
 (61, 45),
 (129, 45),
 (17, 44),
 (1, 42),
 (29, 38),
 (20, 36),
 (94, 36),
 (28, 33),
 (101, 32),
 (74, 31),
 (104, 29),
 (43, 28),
 (76, 27),
 (88, 26),
 (32, 23),
 (109, 23),
 (145, 23),
 (92, 22),
 (113, 22),
 (210, 22),
 (70, 21),
 (125, 21),
 (71, 20),
 (136, 20),
 (207, 20),
 (89, 19),
 (148, 19),
 (14, 18),
 (41, 18),
 (116, 18),
 (107, 17),
 (150, 17),
 (38, 16),
 (49, 16),
 (72, 16),
 (87, 16),
 (46, 15),
 (60, 15),
 (62, 15),
 (82, 15),
 (106, 15),
 (19, 14),
 (45, 14),
 (130, 14),
 (226, 14),
 (238, 14),
 (56, 13),
 (115, 13),
 (137, 13),
 (160, 13),
 (22, 12),
 (47, 12),
 (52, 12),
 (69, 12),
 (108, 12),
 (122, 12),
 (134, 12),
 (63, 11),
 (111, 11),
 (123, 11),
 (156, 11),
 (230, 11),
 (50, 10),

In [23]:
# Output directory for the per-group (d90) files.
d90dir = 'mash_d90'
# exist_ok=True makes the cell safe to re-run and avoids the separate
# exists-check / mkdir steps.
os.makedirs(d90dir, exist_ok=True)

Prototyping with a single group first, to see how fast the reciprocal BLAST (DIAMOND blastp) runs.

Using group 6, which has 90 members.

In [24]:
# Rows of the MASH table that belong to d90_group 6 (the prototype group)
subdf = df[df['d90_group'] == 6]

In [28]:
# Concatenate the per-sample ORF FASTA files (faa/<sample>.faa) of every
# sample in the group into a single FASTA, wrapping sequences at 60 chars.
with open(op.join(d90dir, "d90_group6_orfs.fasta"), "w") as oh:
    for s in subdf['sample']:
        # Close each input file once it has been copied; the original left
        # one open handle per sample.
        with open(op.join("faa", "{}.faa".format(s))) as ih:
            for name, seq in read_fasta(ih):
                print(">{}".format(name), file=oh)
                for i in range(0, len(seq), 60):
                    print(seq[i:i+60], file=oh)

In [30]:
# NOTE(review): each `!` command runs in its own subshell, so this
# `module load` most likely does not carry over to later cells; the cells
# below call diamond by its absolute path instead.
!module load diamond/0.9.10

In [33]:
# Build the DIAMOND database mash_d90/d90_group6 from the group-6 ORF FASTA,
# using 5 threads.
!/mnt/scgc_nfs/opt/common/diamond/0.9.10/diamond makedb --in {op.join(d90dir, 'd90_group6_orfs.fasta')} -d mash_d90/d90_group6 -p 5

diamond v0.9.10.111 | by Benjamin Buchfink <buchfink@gmail.com>
Licensed under the GNU AGPL <https://www.gnu.org/licenses/agpl.txt>
Check http://github.com/bbuchfink/diamond for updates.

#CPU threads: 5
Scoring parameters: (Matrix=BLOSUM62 Lambda=0.267 K=0.041 Penalties=11/1)
Database file: mash_d90/d90_group6_orfs.fasta
Opening the database file...  [0.035421s]
Loading sequences...  [0.173223s]
Masking sequences...  [1.6397s]
Writing sequences...  [0.064997s]
Loading sequences...  [7.2e-05s]
Writing trailer...  [0.002576s]
Closing the input file...  [0.00077s]
Closing the database file...  [0.080991s]
Processed 62758 sequences, 19882031 letters.
Total time = 1.99834s


In [35]:
!echo "/mnt/scgc_nfs/opt/common/diamond/0.9.10/diamond blastp -d mash_d90/d90_group6 -q {op.join(d90dir, 'd90_group6_orfs.fasta')} --more-sensitive -f 6 -k 0 --id 97 --query-cover 100 --subject-cover 100 --no-self-hits -o bats248_orfs_recip_diamond.out'|qsub "

/mnt/scgc_nfs/opt/common/diamond/0.9.10/diamond blastp -d mash_d90/d90_group6 -q mash_d90/d90_group6_orfs.fasta --more-sensitive -f 6 -k 0 --id 97 --query-cover 100 --subject-cover 100 --no-self-hits -o bats248_orfs_recip_diamond.out'|qsub 


In [42]:
# Shell text that echoes the DIAMOND blastp command line for group 6
# (absolute paths, 60 threads, >= 97% identity, 100% query and subject
# cover, self hits excluded); it is combined with `sub` below.
cmd = ' echo "/mnt/scgc_nfs/opt/common/diamond/0.9.10/diamond blastp -d /mnt/scgc/simon/simonsproject/bats248_annotations/mash_d90/d90_group6 -q /mnt/scgc/simon/simonsproject/bats248_annotations/mash_d90/d90_group6_orfs.fasta -p 60 --more-sensitive -f 6 -k 0 --id 97 --query-cover 100 --subject-cover 100 --no-self-hits -o /mnt/scgc/simon/simonsproject/bats248_annotations/d90_group6_orfs_recip_diamond.out"'

In [43]:
# qsub invocation for the job: name g6d, queue scgc-route, 20 h walltime,
# 60 CPUs, log written to ~/out/171208_g6_diamond.
sub = "qsub -N g6d -q scgc-route -V -l walltime=20:00:00,ncpus=60 -W umask=0002 -j oe -o ~/out/171208_g6_diamond"

In [44]:
# Show the full `echo ... | qsub ...` pipeline. This only prints the
# command text; it does not execute it.
print(cmd,sub, sep="|")

 echo "/mnt/scgc_nfs/opt/common/diamond/0.9.10/diamond blastp -d /mnt/scgc/simon/simonsproject/bats248_annotations/mash_d90/d90_group6 -q /mnt/scgc/simon/simonsproject/bats248_annotations/mash_d90/d90_group6_orfs.fasta -p 60 --more-sensitive -f 6 -k 0 --id 97 --query-cover 100 --subject-cover 100 --no-self-hits -o /mnt/scgc/simon/simonsproject/bats248_annotations/d90_group6_orfs_recip_diamond.out"|qsub -N g6d -q scgc-route -V -l walltime=20:00:00,ncpus=60 -W umask=0002 -j oe -o ~/out/171208_g6_diamond


In [47]:
# Column labels for the 12 fields of the DIAMOND tabular output (-f 6).
names = [
    'query', 'subject', 'pct_id', 'aln_len', 'mismatch', 'gaps',
    'qstart', 'qend', 'sstart', 'send', 'eval', 'bitscore',
]
# NOTE(review): the blastp command built in this notebook writes its output
# one directory above mash_d90/ - confirm the file was moved to this path.
bdf = pd.read_csv(
    "mash_d90/d90_group6_orfs_recip_diamond.out",
    sep="\t",
    names=names,
)

In [2]:
import networkx as nx
import random
import community

def get_communities(row1, row2):
    '''Partition the graph whose edges are the pairs (row1[i], row2[i]).

    Args:
        row1: list of node labels, one per edge (first endpoint).
        row2: list of node labels parallel to row1 (second endpoint).

    Returns:
        Whatever community.best_partition returns for the undirected graph
        built from these edges (a node -> community id mapping).
    '''
    graph = nx.Graph()
    # `+` is list concatenation here, so both arguments must be lists
    graph.add_nodes_from(set(row1 + row2))
    graph.add_edges_from(zip(row1, row2))
    return community.best_partition(graph)

In [57]:
# Assign a community id to every ORF that appears as a query or subject in
# the DIAMOND hit table (edges are the query/subject pairs).
partition = get_communities(list(bdf['query'].values), list(bdf['subject'].values))

In [58]:
# Invert the ORF -> community mapping into community -> [member ORFs].
pgroups = defaultdict(lambda: [])
for orf_id, community_id in partition.items():
    pgroups[community_id].append(orf_id)

In [68]:
# Community sizes (members per community id), largest first
Counter(partition.values()).most_common()

[(625, 40),
 (1498, 40),
 (742, 39),
 (937, 39),
 (166, 38),
 (784, 38),
 (1142, 38),
 (300, 37),
 (351, 37),
 (579, 37),
 (761, 37),
 (873, 37),
 (202, 36),
 (206, 36),
 (417, 36),
 (433, 36),
 (943, 36),
 (994, 36),
 (2075, 36),
 (2106, 36),
 (2152, 36),
 (76, 35),
 (146, 35),
 (191, 35),
 (343, 35),
 (585, 35),
 (663, 35),
 (845, 35),
 (853, 35),
 (940, 35),
 (947, 35),
 (1018, 35),
 (1277, 35),
 (1346, 35),
 (1421, 35),
 (1425, 35),
 (1506, 35),
 (1534, 35),
 (1939, 35),
 (2327, 35),
 (2378, 35),
 (2803, 35),
 (2981, 35),
 (37, 34),
 (151, 34),
 (213, 34),
 (252, 34),
 (307, 34),
 (398, 34),
 (479, 34),
 (506, 34),
 (660, 34),
 (700, 34),
 (890, 34),
 (936, 34),
 (966, 34),
 (1031, 34),
 (1062, 34),
 (1227, 34),
 (1433, 34),
 (1481, 34),
 (1720, 34),
 (2049, 34),
 (2063, 34),
 (2210, 34),
 (2295, 34),
 (2345, 34),
 (2381, 34),
 (2789, 34),
 (2814, 34),
 (26, 33),
 (28, 33),
 (33, 33),
 (38, 33),
 (56, 33),
 (183, 33),
 (237, 33),
 (255, 33),
 (286, 33),
 (298, 33),
 (333, 33),
 (38

In [60]:
# Number of ORFs that were assigned to a community
len(partition)

56569

In [61]:
# Inspect the community -> member ORF lists.
# NOTE(review): this displays the whole mapping, which is very large.
pgroups

defaultdict(<function __main__.<lambda>>,
            {0: ['AG-898-A22_00129',
              'AG-900-C19_00102',
              'AG-907-G20_00451',
              'AG-913-P07_00811',
              'AG-920-O09_00278',
              'AG-894-P08_00347',
              'AG-913-J18_00572',
              'AG-895-G19_00754',
              'AG-917-I09_00319'],
             1: ['AG-900-C19_01029',
              'AG-901-M03_00143',
              'AG-913-J18_00090',
              'AG-905-J04_00039',
              'AG-907-G20_00693',
              'AG-898-A22_00753',
              'AG-919-I06_00787'],
             2: ['AG-900-E02_00328',
              'AG-891-F02_00582',
              'AG-905-M09_00525',
              'AG-915-J13_00228',
              'AG-912-I20_00323',
              'AG-901-D06_00050',
              'AG-908-A14_00268',
              'AG-899-I17_00260',
              'AG-892-K16_00160',
              'AG-907-I11_00232',
              'AG-909-B19_00844',
              'AG-892-D09_003

In [70]:
# ORF records in the group-6 FASTA (62758) minus ORFs present in the
# partition (56569), plus the number of communities: the number of groups
# when every ORF outside the partition is counted as its own group.
62758 - 56569 + len(pgroups)

11515

In [67]:
# `nodes` is not defined by any earlier cell, so build it here: every ORF id
# that appears as a query or a subject in the DIAMOND hit table. A set keeps
# later membership tests fast.
nodes = set(bdf['query']).union(bdf['subject'])
len(nodes)

56569

In [65]:
# Count the lines containing ">" (FASTA headers) in the group-6 ORF file
!grep -c ">" ./mash_d90/d90_group6_orfs.fasta

62758


In [9]:
import pyfaidx

In [72]:
# Open the group-6 ORF FASTA with pyfaidx; f.keys() is used below to list
# the record names.
f = pyfaidx.Fasta("./mash_d90/d90_group6_orfs.fasta")

In [74]:
# ORFs in the FASTA that never appear in the DIAMOND hit table.
# The ids are taken straight from the hit table (query and subject columns)
# rather than from `nodes`, which no earlier cell defines; a set makes each
# membership test constant time.
hit_ids = set(bdf['query']).union(bdf['subject'])
no_match = [name for name in f.keys() if name not in hit_ids]

In [78]:
# Peek at the first ORF name that had no match
no_match[0]

'AG-893-J18_00127'

In [76]:
# Number of FASTA records whose name was not found among the matched ids
len(no_match)

6189

In [77]:
# Sanity check: FASTA records (62758) minus ORFs in the partition (56569)
# should equal len(no_match)
62758 - 56569

6189

In [None]:
def write_key(no_match, partition, outfile):
    '''Write a two-column CSV key that assigns every ORF to a group.

    Args:
        no_match: iterable of ORF names that belong to no community; each is
            written with the group label "no_group".
        partition: dict mapping ORF name -> community id; each entry is
            written as "name,community_id".
        outfile: path of the CSV file to create (overwritten if present).
    '''
    with open(outfile, "w") as oh:
        for n in no_match:
            # `file` must be passed by keyword; a positional argument after
            # `sep=` is a syntax error.
            print(n, "no_group", sep=",", file=oh)
        for name, group in partition.items():
            print(name, group, sep=",", file=oh)

In [3]:
# Path of the DIAMOND hit table read in the next cell
# (presumably the run over the complete ORF set - confirm).
all_blast_compf = "/mnt/scgc/simon/simonsproject/bats248_annotations/bats248_orfs_recip_diamond.out"

In [4]:
# Column labels for the 12 fields of the DIAMOND tabular output (-f 6).
names = [
    'query', 'subject', 'pct_id', 'aln_len', 'mismatch', 'gaps',
    'qstart', 'qend', 'sstart', 'send', 'eval', 'bitscore',
]
# Rebinds bdf to the hit table stored at all_blast_compf.
bdf = pd.read_csv(
    all_blast_compf,
    sep="\t",
    names=names,
)

In [5]:
# Assign a community id to every ORF that appears as a query or subject in
# the hit table loaded above (rebinds partition).
partition = get_communities(list(bdf['query'].values), list(bdf['subject'].values))

In [6]:
# Invert the ORF -> community mapping into community -> [member ORFs].
pgroups = defaultdict(lambda: [])
for orf_id, community_id in partition.items():
    pgroups[community_id].append(orf_id)

In [7]:
# Number of communities
len(pgroups)

562092

In [8]:
# Number of ORFs that were assigned to a community
len(partition)

2786137

In [10]:
# Open the FASTA of all ORFs with pyfaidx (rebinds f); f.keys() is used
# below to list the record names.
f = pyfaidx.Fasta("/mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs.faa")

In [None]:
# Every ORF id that appears as a query or a subject in the hit table.
# Kept as a set (it was converted to a list before) so that the
# `name not in nodes` test in the next cell is constant time rather than a
# scan over millions of ids per lookup.
nodes = set(bdf['query']).union(bdf['subject'])

In [None]:
# ORFs in the FASTA that are absent from `nodes`.
# Membership is tested against a set: with `nodes` as a list, each lookup
# scans the whole list, which is quadratic over millions of ORF names.
matched_ids = set(nodes)
no_match = [name for name in f.keys() if name not in matched_ids]