Initial clustering and mica runs for BATS248

In [1]:
import pandas as pd
import os
import os.path as op
import shutil

In [2]:
df = pd.read_csv("/mnt/scgc/simon/simonsproject/info/BATS248_keepers_coassembly_stats_171221.csv")

In [3]:
for i in ["faa","gff"]:
    if op.exists(op.join('/mnt/scgc/simon/simonsproject/bats248_annotations/',i)) == False:
        os.mkdir(op.join('/mnt/scgc/simon/simonsproject/bats248_annotations/',i))

count = 0
for i, l in df.iterrows():
    sag = l['sample']
    plate = "-".join(l['sample'].split("-")[:2])
    faa = op.join("/mnt/scgc/simon/results/postlocos/stepanauskas_2016_simons/",plate,sag,"{}_functional_annotation/Prokka/".format(sag),"{}.faa".format(sag))
    gff = op.join("/mnt/scgc/simon/results/postlocos/stepanauskas_2016_simons/",plate,sag,"{}_functional_annotation/Prokka/".format(sag),"{}.gff".format(sag))
    if op.exists(faa):
        count += 1
        shutil.copy(faa, "/mnt/scgc/simon/simonsproject/bats248_annotations/faa/")
        shutil.copy(gff, "/mnt/scgc/simon/simonsproject/bats248_annotations/gff/")
print(count, "faa files copied over")

!cat /mnt/scgc/simon/simonsproject/bats248_annotations/faa/*.faa > /mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs.fasta

In [4]:
in_fasta = '/mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs.fasta'
out_fasta = '/mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs_cdhit_c90.fasta'

In [5]:
cmd = "cd-hit -i {in_fa} -p 1 -c 0.90 -o {out_fa} -T 50 -M 100000".format(in_fa = in_fasta, out_fa = out_fasta)
outlog = "/home/julia/out/171229_cdhit.out"

print("echo '{cmd}' | qsub -N cdhit -q scgc-route -V -l walltime=3:00:00,ncpus=50,mem=150G -j oe -o {log}".format(cmd = cmd, log=outlog))


echo 'cd-hit -i /mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs.fasta -p 1 -c 0.90 -o /mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs_cdhit_c90.fasta -T 50 -M 100000' | qsub -N cdhit -q scgc-route -V -l walltime=3:00:00,ncpus=50,mem=150G -j oe -o /home/julia/out/171229_cdhit.out


After that's done, we'll cluster the 90% ID clusters to 70% ID, then split them up by cluster into groups ~1000 sequences per file and start a massive array submission.

In [6]:
from nb_tools import run_cd_hit, cluster_map, readfa
import numpy as np
from collections import defaultdict
import itertools
import fileinput

In [7]:
out70_fasta = out_fasta.replace(".fasta","_c70.fasta")
out70_fasta

'/mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs_cdhit_c90_c70.fasta'

In [8]:
outlist = run_cd_hit(out_fasta, out70_fasta, T=2, c=.70)

Running CD-HIT on /mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs_cdhit_c90.fasta


In [11]:
fa, clstr = outlist

In [24]:
singles = True
cluster_map = defaultdict(list)
with open(clstr) as fh:
    for cluster_start, group in itertools.groupby(fh, lambda l: l[0] == '>'):
        members = []
        rep_seq = ''
        if not cluster_start: 
            for line in group:
                if "*" in line: 
                    rep_seq = line.partition(" ")[0]
                else:
                    members.append(line.partition(" ")[0])
        if len(rep_seq) == 0:
            continue
        
        if singles:
            cluster_map[rep_seq] = members
        elif len(members) > 0:
            cluster_map[rep_seq] = members
        else:
            continue

In [25]:
len(cluster_map)

463111

In [26]:
cluster_map

defaultdict(list,
            {'AG-908-B16_01149': [],
             'AG-893-D16_00227': [],
             'AG-895-G08_00073': [],
             'AG-909-E04_00816': ['AG-910-F10_00251', 'AG-914-M07_00761'],
             'AG-896-D14_00412': ['AG-898-M15_01106'],
             'AG-913-D15_00035': [],
             'AG-898-N03_00209': ['AG-914-J14_00054'],
             'AG-914-E04_00189': [],
             'AG-895-F09_00367': [],
             'AG-360-P05_00591': ['AG-892-A04_00838', 'AG-919-I04_01233'],
             'AG-891-O19_00014': [],
             'AG-911-K11_00125': [],
             'AG-892-D10_01171': [],
             'AG-891-J04_00229': [],
             'AG-909-D02_00019': [],
             'AG-891-E15_00895': [],
             'AG-359-A11_00348': [],
             'AG-907-B17_00607': [],
             'AG-903-E19_00010': [],
             'AG-359-E06_00309': ['AG-894-A09_00364'],
             'AG-360-K05_00452': [],
             'AG-891-C06_00935': ['AG-892-I21_00399',
              'AG-900

In [27]:
# now split the 90% clustered fasta into 800 pieces...

In [30]:
from pyfaidx import Fasta
from nb_tools import write_fa_record

In [29]:
fa = Fasta("/mnt/scgc/simon/simonsproject/bats248_annotations/bats248_all_orfs_cdhit_c90.fasta")

In [34]:
count = 0
number = 0
missing = []
for k in cluster_map:
    seqs = [k] + cluster_map[k]
    count += len(seqs)
    with open("/mnt/scgc/simon/simonsproject/bats248_vs/blastin/bats248_orfs_cdhit90_{}.fasta".format(number), "a") as oh:
        for s in seqs:
            try:
                rec = fa[s]
            except:
                missing.append(s)
                continue
                
            write_fa_record(rec.long_name, str(rec), oh)          

    if count > 1000:
        count = 0
        number += 1
        print("resetting count", number)

resetting count 1
resetting count 2
resetting count 3
resetting count 4
resetting count 5
resetting count 6
resetting count 7
resetting count 8
resetting count 9
resetting count 10
resetting count 11
resetting count 12
resetting count 13
resetting count 14
resetting count 15
resetting count 16
resetting count 17
resetting count 18
resetting count 19
resetting count 20
resetting count 21
resetting count 22
resetting count 23
resetting count 24
resetting count 25
resetting count 26
resetting count 27
resetting count 28
resetting count 29
resetting count 30
resetting count 31
resetting count 32
resetting count 33
resetting count 34
resetting count 35
resetting count 36
resetting count 37
resetting count 38
resetting count 39
resetting count 40
resetting count 41
resetting count 42
resetting count 43
resetting count 44
resetting count 45
resetting count 46
resetting count 47
resetting count 48
resetting count 49
resetting count 50
resetting count 51
resetting count 52
resetting count 53
re

resetting count 418
resetting count 419
resetting count 420
resetting count 421
resetting count 422
resetting count 423
resetting count 424
resetting count 425
resetting count 426
resetting count 427
resetting count 428
resetting count 429
resetting count 430
resetting count 431
resetting count 432
resetting count 433
resetting count 434
resetting count 435
resetting count 436
resetting count 437
resetting count 438
resetting count 439
resetting count 440
resetting count 441
resetting count 442
resetting count 443
resetting count 444
resetting count 445
resetting count 446
resetting count 447
resetting count 448
resetting count 449
resetting count 450
resetting count 451
resetting count 452
resetting count 453
resetting count 454
resetting count 455
resetting count 456
resetting count 457
resetting count 458
resetting count 459
resetting count 460
resetting count 461
resetting count 462
resetting count 463
resetting count 464
resetting count 465
resetting count 466
resetting count 467


In [32]:
missing

[]

In [36]:
815330/811

1005.3390875462392

subitted job array to c2:
```
#!/bin/bash                                                                                                             

## set name of PBS job                                                                                                  
#PBS -N mica_array

## set the job array variable                                                                                           
#PBS -J 1-812                                                                                                          

## send the environment variables with job                                                                              
#PBS -V

## set the queue                                                                                                        
#PBS -q scgc-route
#PBS -l walltime=24:00:00                                                                                               

#PBS -l ncpus=20                                                                           
                                                 
#PBS -j oe                                                                                              
#PBS -o /home/julia/out/171229_mica_array/${PBS_ARRAY_INDEX}.out

module load mica

num=$(($PBS_ARRAY_INDEX-1))

echo "My job index is: $PBS_ARRAY_INDEX"

echo $PBS_ARRAY_INDEX
echo $num

echo running mica $num

mica-search --p='20' --blastp 'blastp' /mnt/scgc/simon/databases/mica/nr-20150620-mica \
/mnt/scgc/simon/simonsproject/bats248_vs/blastin/bats248_orfs_cdhit90_${num}.fasta \
--blast-args -outfmt \
'6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sallseqid score nident positive gaps ppos qframe sframe qseq sseq qlen slen salltitles' \
-num_alignments 10 -evalue 0.001 \
-out /mnt/scgc/simon/simonsproject/bats248_vs/blastout/bats248_orfs_cdhit90_${num}.out
```