Goal: Collect results from new round of MMSeq clustering
Output: an array with two columns, original ORF and new 2nd round clustered ORF
For each of the ~1300 original ORFs (in the case of nar), find its 1st round cluster in the subset folder.
Then, find its second round cluster from MMSeq output. 

Then, use the output to find a new grouping of variants

If you run MMSeq again, all of this can be repeated with the run build_new_map script

In [1]:
import numpy as np
import pandas as pd
from functions import find_orfs, get_filepath

soils = ['Soil3', 'Soil5', 'Soil6', 'Soil9', 'Soil11', 'Soil12', 'Soil14', 'Soil15', 'Soil16', 'Soil17']



(0) Find all ORFs

In [2]:
ORFs = find_orfs(get_filepath('T0', 'annotation', 'K02567'), 'K02567')
ORF_complete = list(ORFs)
    

for soil in soils:
    ORFs = find_orfs(get_filepath(soil, 'annotation', 'K02567'), 'K02567')
    ORF_list = list(ORFs)
    ORF_complete = ORF_complete + ORF_list
    
print(ORF_complete)
print(len(ORF_complete))

['T0.scaffold_826610199_c1_1', 'T0.scaffold_170828508_c1_1', 'T0.scaffold_785029462_c1_5', 'T0.scaffold_500133190_c1_1', 'T0.scaffold_915148188_c1_1', 'T0.scaffold_133727621_c1_1', 'T0.scaffold_629956420_c1_1', 'T0.scaffold_915288777_c1_1', 'T0.scaffold_11476034_c1_1', 'T0.scaffold_61541039_c1_1', 'T0.scaffold_831992212_c1_1', 'Soil3.scaffold_38977414_c1_1', 'Soil3.scaffold_374008697_c1_11', 'Soil3.scaffold_484563224_c1_1', 'Soil3.scaffold_404563946_c1_1', 'Soil5.scaffold_42391532_c1_4', 'Soil5.scaffold_349892274_c1_26', 'Soil5.scaffold_151631252_c1_54', 'Soil5.scaffold_498157886_c1_1', 'Soil5.scaffold_243654554_c1_4', 'Soil6.scaffold_534384613_c1_1', 'Soil9.scaffold_403279197_c1_1', 'Soil9.scaffold_278269692_c1_1', 'Soil9.scaffold_399115430_c1_5', 'Soil9.scaffold_471201764_c1_2', 'Soil9.scaffold_422795741_c1_2', 'Soil9.scaffold_226354075_c1_6', 'Soil9.scaffold_754790964_c1_1', 'Soil9.scaffold_614425102_c1_1', 'Soil11.scaffold_222086193_c1_1', 'Soil11.scaffold_240683483_c1_1', 'Soil11.

(1), Create the array mapping original ORF to new ORF. 

(1a) Create a mapping from original ORF to 1st cluster ORF

In [3]:

def find_1st_orf(orf, file_path):
    df = pd.read_csv(file_path, sep = '\t', header = None)
    return df[df[0] == orf][1].iloc[0]

example = "T0.scaffold_419240115_c1_16"
second = find_1st_orf(example, file_path_1)
print(f"the second ID for {example} is: {second}")

NameError: name 'file_path_1' is not defined

create a list of 1st round cluster IDs

In [None]:
prefixes = ['T0', 'Soil3', 'Soil5', 'Soil6', 'Soil9', 'Soil11', 'Soil12', 'Soil14', 'Soil15', 'Soil16', 'Soil17']
file_path_list = []
for prefix in prefixes:
    file_path_list.append(get_filepath(prefix, 'annotation_K00370'))

In [None]:
orf_1_list = []
for i, prefix in enumerate(prefixes):
    ORF_list = find_orfs(file_path_list[i], 'K00370')
    for orf in ORF_list:
        orf_1 = find_1st_orf(orf, file_path_list[i])
        if orf_1 not in orf_1_list:
            orf_1_list.append(orf_1)
        
print(orf_1_list)
print(len(orf_1_list))

(1b) Create a mapping from 1st cluster ORF to new 2nd cluster ORF

In [None]:
from concurrent.futures import ThreadPoolExecutor

def find_2nd_orf_chunk(id, chunk):
    """Search a chunk for the ID."""
    match = chunk[chunk[1] == id]
    return match[0].iloc[0] if not match.empty else None

def find_2nd_orf_parallel(id, file_path, chunksize=10_000_000):
    """Parallel chunked search."""
    with ThreadPoolExecutor(max_workers=8) as executor:
        for chunk in pd.read_csv(file_path, sep='\t', header=None, 
                               usecols=[0, 1], dtype=str, chunksize=chunksize):
            result = executor.submit(find_2nd_orf_chunk, id, chunk).result()
            if result is not None:
                return result
    return None

file_path = 'data/raw_data/all.coassembly_proteins_1st_ClusterDB_repseq_2ndClusterDB.tsv'
# Usage:
#answer: Soil15.scaffold_1276187927_c1_1
test = 'Soil14.scaffold_984904475_c1_1'
print(find_2nd_orf_parallel(test, file_path))

(2) Create a new ORF list, with the new 2nd cluster

In [None]:
#Needs to be ran on quest

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
from collections import defaultdict

from functions import find_orfs, get_filepath




prefixes = ['T0', 'Soil3', 'Soil5', 'Soil6', 'Soil9', 'Soil11', 'Soil12', 'Soil14', 'Soil15', 'Soil16', 'Soil17']
file_path_list = []
for prefix in prefixes:
    file_path_list.append(get_filepath(prefix, 'annotation_K00370'))

#Want to produce "map" with 1336 rows corresponding to K00370 ORFs

map = []

#The 2nd Column is the round 1 cluster ORFs

#The 3rd Column is the round 2 clustered ORFs, with the new clustering 

#produce a list of round 1 ORFs
def find_1st_orf(orf, file_path):
    df = pd.read_csv(file_path, sep = '\t', header = None)
    return df[df[0] == orf][1].iloc[0]

orf_1_list = []
for i in range(len(prefixes)):
    ORF_list = find_orfs(file_path_list[i], 'K00370')
    for orf in ORF_list:
        orf_1 = find_1st_orf(orf, file_path_list[i])
        map.append([orf, orf_1, 'blank'])
        if orf_1 not in orf_1_list:
            orf_1_list.append(orf_1)
            
print('orf_1_list: ', orf_1_list)
print('len: ', len(orf_1_list))
        
print('Completed 1st Round')

import pandas as pd

def build_targeted_lookup(file_path, target_orfs):
    """Memory-efficient lookup builder"""
    lookup_dict = {}
    target_orfs = set(target_orfs)
    
    # Use low-memory chunking
    for chunk in pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        usecols=[0, 1],
        dtype=str,
        chunksize=10_000_000
    ):
        # Vectorized filtering
        mask = chunk[1].isin(target_orfs)
        filtered = chunk[mask]
        lookup_dict.update(zip(filtered[1], filtered[0]))
    
    return lookup_dict

# Usage:
file_path = 'data/raw_data/all.coassembly_proteins_1st_ClusterDB_repseq_2ndClusterDB.tsv'
lookup_dict = build_targeted_lookup(file_path, orf_1_list)  # Do this ONCE

def find_2nd_orf(id, lookup_dict):
    """O(1) lookup from preloaded dictionary."""
    return lookup_dict.get(id, None)

for entry in map:
    entry[2] = find_2nd_orf(entry[1])
    print(entry[2])

    

np.savetxt("out/cluster08map_nar.tsv", map, delimiter = '\t', fmt = '%s')

In [None]:
##Needs to be ran on quest

prefixes = ['T0', 'Soil3', 'Soil5', 'Soil6', 'Soil9', 'Soil11', 'Soil12', 'Soil14', 'Soil15', 'Soil16', 'Soil17']
file_path_list = []
for prefix in prefixes:
    file_path_list.append(get_filepath(prefix, 'annotation_K00370'))

#Want to produce "map" with 1336 rows corresponding to K00370 ORFs

map = []

#The 2nd Column is the round 1 cluster ORFs

#The 3rd Column is the round 2 clustered ORFs, with the new clustering 

#produce a list of round 1 ORFs
def find_1st_orf(orf, file_path):
    df = pd.read_csv(file_path, sep = '\t', header = None)
    return df[df[0] == orf][1].iloc[0]

orf_1_list = []
for i in range(len(prefixes)):
    ORF_list = find_orfs(file_path_list[i], 'K00370')
    for orf in ORF_list:
        orf_1 = find_1st_orf(orf, file_path_list[i])
        map.append([orf, orf_1, 'blank'])
        if orf_1 not in orf_1_list:
            orf_1_list.append(orf_1)
            
print('orf_1_list: ', orf_1_list)
print('len: ', len(orf_1_list))
        
print('Completed 1st Round')

def build_targeted_lookup(file_path, target_orfs):
    """Memory-efficient lookup builder"""
    lookup_dict = {}
    target_orfs = set(target_orfs)
    
    # Use low-memory chunking
    for chunk in pd.read_csv(
        file_path,
        sep='\t',
        header=None,
        usecols=[0, 1],
        dtype=str,
        chunksize=10_000_000
    ):
        # Vectorized filtering
        mask = chunk[1].isin(target_orfs)
        filtered = chunk[mask]
        lookup_dict.update(zip(filtered[1], filtered[0]))
    
    return lookup_dict

# Usage:
file_path = 'data/raw_data/all.coassembly_proteins_1st_ClusterDB_repseq_2ndClusterDB.tsv'
lookup_dict = build_targeted_lookup(file_path, orf_1_list)  # Do this ONCE

print('Dictionary Built')

def find_2nd_orf(id, lookup_dict):
    """O(1) lookup from preloaded dictionary."""
    return lookup_dict.get(id, None)

for entry in map:
    entry[2] = find_2nd_orf(entry[1], lookup_dict)
    print(entry[2])

np.savetxt("out/new_cluster_map.tsv", map, delimiter = '\t', fmt = '%s')
    

(3) Collect abundances, given the new list

In [None]:
cluster_map = pd.read_csv('out/cluster08map_nar.tsv', sep = '\t', header=None)
cluster_map = cluster_map.values
print(cluster_map)

cluster_IDs = []

for cluster in cluster_map:
    if cluster[2] not in cluster_IDs:
        cluster_IDs.append(cluster[2])
        
print(len(cluster_IDs))
print(cluster_IDs)
np.savetxt("out/cluster_ids_271_nar.tsv", cluster_IDs, delimiter = '\t', fmt = '%s')

[['T0.scaffold_563122406_c1_2' 'T0.scaffold_563122406_c1_2'
  'Soil17.scaffold_102369427_c1_1']
 ['T0.scaffold_610518454_c1_1' 'T0.scaffold_610518454_c1_1'
  'Soil3.scaffold_26587228_c1_1']
 ['T0.scaffold_914934534_c1_1' 'T0.scaffold_914934534_c1_1'
  'Soil9.scaffold_63696370_c1_1']
 ...
 ['Soil17.scaffold_221362183_c1_2' 'Soil17.scaffold_221362183_c1_2'
  'Soil17.scaffold_221362183_c1_2']
 ['Soil17.scaffold_102369427_c1_1' 'Soil17.scaffold_102369427_c1_1'
  'Soil17.scaffold_102369427_c1_1']
 ['Soil17.scaffold_1100384366_c1_1' 'Soil17.scaffold_1100384366_c1_1'
  'Soil15.scaffold_116822102_c1_19']]
271
['Soil17.scaffold_102369427_c1_1', 'Soil3.scaffold_26587228_c1_1', 'Soil9.scaffold_63696370_c1_1', 'Soil5.scaffold_113795878_c1_1', 'Soil9.scaffold_353055196_c1_5', 'T0.scaffold_308047681_c1_10', 'Soil6.scaffold_47353953_c1_4', 'Soil6.scaffold_370324234_c1_1', 'Soil5.scaffold_426188557_c1_1', 'Soil5.scaffold_549637727_c1_1', 'Soil16.scaffold_1078417100_c1_2', 'Soil3.scaffold_504725650_c1_