In [1]:
import pandas as pd
from Bio import SeqIO, AlignIO

In [2]:
# Parse the FASTA file and materialise all records in a list, so the
# collection can be indexed and iterated more than once.
seq = list(SeqIO.parse("uTP_HMM_hits.fasta", "fasta"))

In [3]:
# Load the protein quantification table.
quantification_csv = 'ADK1075_ProteinQuantifications.csv'
df = pd.read_csv(quantification_csv)

In [4]:
def filter(row):
    """Row predicate for the quantification table.

    A row is accepted when its 'Sequence source' is a string starting with
    'B. bigelowii' and either 'Significant_Day' or 'Significant_Night'
    equals 'UCYN-A'. Non-string sources (e.g. NaN) are rejected before the
    other columns are looked at.

    Note: this name shadows the builtin ``filter`` for the rest of the
    notebook.
    """
    source = row['Sequence source']
    if type(source) != str:
        return False
    if not source.startswith('B. bigelowii'):
        return False
    return row['Significant_Day'] == 'UCYN-A' or row['Significant_Night'] == 'UCYN-A'

# Boolean mask of the rows accepted by `filter`, evaluated row by row.
selected_rows = df.apply(filter, axis=1)
# Distinct 'Time' values of the selected rows.
# NOTE(review): these values are later compared against FASTA record ids,
# so the 'Time' column presumably holds protein identifiers — confirm.
ucyn_a_prots = set(df[selected_rows]['Time'].to_list())

In [5]:
# Keep only the FASTA records whose id is in the UCYN-A protein set.
# NOTE(review): `seq` must still be the list of records here; later cells
# reuse the name `seq` for plain strings, so re-running this cell out of
# order would iterate over characters instead.
ucyn_a_prots_seqs = [record for record in seq if record.id in ucyn_a_prots]

In [6]:
# Number of trailing residues kept from every selected protein.
C_TERMINAL_LENGTH = 140

# (record id, C-terminal residues) for every selected protein. Sequences
# shorter than C_TERMINAL_LENGTH are kept whole, because a negative slice
# start beyond the string length clamps to the beginning.
# A comprehension keeps the per-record string local, so the notebook-level
# `seq` (the list of FASTA records) is not overwritten by this cell.
c_terminal_sequences = [
    (record.id, str(record.seq)[-C_TERMINAL_LENGTH:])
    for record in ucyn_a_prots_seqs
]


In [7]:
# Write c-terminal sequences to fasta file.
# Every header is the record id prefixed with 'seq_', followed by the
# sequence on a single line.
# The loop names are chosen so that neither the builtin `id` nor the
# notebook-level `seq` (the list of FASTA records) gets rebound here.
with open('ucyn_a_c_terminal.fasta', 'w') as f:
    for record_id, c_terminal in c_terminal_sequences:
        f.write(f'>seq_{record_id}\n{c_terminal}\n')

In [8]:
import subprocess
from tqdm import tqdm

def cluster(e_val, min_cov, min_seq_id):
    """Cluster the C-terminal FASTA with ``mmseqs easy-cluster`` and save the result.

    Runs mmseqs on ``ucyn_a_c_terminal.fasta``, reads the resulting
    ``result/c-term_cluster.tsv`` (one representative/member pair per line),
    groups the members per representative, logs how many clusters have more
    than one member, and writes the grouped table to
    ``clustered/clustered_{e_val}_{min_cov}_{min_seq_id}.csv``.

    Args:
        e_val: value passed to mmseqs ``-e``.
        min_cov: value passed to mmseqs ``-c``.
        min_seq_id: value passed to mmseqs ``--min-seq-id``.

    Raises:
        subprocess.CalledProcessError: if mmseqs exits with a non-zero
            status. Without this check a failed run would silently re-read
            the TSV left behind by a previous parameter combination.
    """
    import os

    tqdm.write(f"Params {e_val} {min_cov} {min_seq_id}")

    # Make sure both output locations exist before anything is written.
    os.makedirs('result', exist_ok=True)
    os.makedirs('clustered', exist_ok=True)

    # mmseqs easy-cluster ucyn_a_c_terminal.fasta result/c-term temp <options>
    # Passed as an argument list, so no shell is involved.
    command = [
        'mmseqs', 'easy-cluster',
        'ucyn_a_c_terminal.fasta', 'result/c-term', 'temp',
        '--min-seq-id', str(min_seq_id),
        '-c', str(min_cov),
        '-e', str(e_val),
    ]
    # mmseqs' standard output is discarded; a non-zero exit status raises.
    subprocess.run(command, stdout=subprocess.DEVNULL, check=True)

    # Read clustered sequences from result/c-term_cluster.tsv
    membership = pd.read_csv(
        'result/c-term_cluster.tsv',
        sep='\t',
        header=None,
        names=['cluster-representative', 'cluster-member'],
    )

    # One row per representative, holding the list of its members.
    cluster_df = membership.groupby('cluster-representative').agg(list)
    cluster_df['count'] = cluster_df['cluster-member'].apply(len)
    cluster_df = cluster_df.sort_values('count', ascending=False)
    tqdm.write(f"Count {len(cluster_df[cluster_df['count'] > 1])}")

    # Save cluster representative sequences to csv with the parameters used.
    # The list-valued 'cluster-member' cells are written as their string
    # representation, so readers of this CSV have to parse them back.
    cluster_df.to_csv(f'clustered/clustered_{e_val}_{min_cov}_{min_seq_id}.csv')

# Parameter grid for the clustering sweep.
e_vals = [0.001, 0.005]
min_covs = [0.7, 0.8, 0.9]
min_seq_ids = [0.0, 0.05, 0.1, 0.3]

# Every (e-value, coverage, identity) triple; the e-value varies slowest and
# the identity threshold fastest.
combinations = [
    (e_threshold, coverage, identity)
    for e_threshold in e_vals
    for coverage in min_covs
    for identity in min_seq_ids
]

# Run the clustering once per parameter combination.
for e_val, min_cov, min_seq_id in tqdm(combinations):
    cluster(e_val, min_cov, min_seq_id)

  0%|          | 0/24 [00:00<?, ?it/s]

Params 0.001 0.7 0.0


  4%|▍         | 1/24 [00:02<01:02,  2.73s/it]

Count 18
Params 0.001 0.7 0.05


  8%|▊         | 2/24 [00:05<00:57,  2.60s/it]

Count 18
Params 0.001 0.7 0.1


 12%|█▎        | 3/24 [00:07<00:53,  2.55s/it]

Count 18
Params 0.001 0.7 0.3


 17%|█▋        | 4/24 [00:10<00:52,  2.60s/it]

Count 15
Params 0.001 0.8 0.0


 21%|██        | 5/24 [00:12<00:49,  2.59s/it]

Count 19
Params 0.001 0.8 0.05


 25%|██▌       | 6/24 [00:15<00:46,  2.56s/it]

Count 19
Params 0.001 0.8 0.1


 29%|██▉       | 7/24 [00:17<00:42,  2.53s/it]

Count 19
Params 0.001 0.8 0.3


 33%|███▎      | 8/24 [00:20<00:40,  2.53s/it]

Count 16
Params 0.001 0.9 0.0


 38%|███▊      | 9/24 [00:23<00:38,  2.57s/it]

Count 20
Params 0.001 0.9 0.05


 42%|████▏     | 10/24 [00:26<00:37,  2.69s/it]

Count 20
Params 0.001 0.9 0.1


 46%|████▌     | 11/24 [00:28<00:35,  2.75s/it]

Count 20
Params 0.001 0.9 0.3


 50%|█████     | 12/24 [00:31<00:31,  2.64s/it]

Count 18
Params 0.005 0.7 0.0


 54%|█████▍    | 13/24 [00:33<00:28,  2.58s/it]

Count 18
Params 0.005 0.7 0.05


 58%|█████▊    | 14/24 [00:36<00:25,  2.54s/it]

Count 18
Params 0.005 0.7 0.1


 62%|██████▎   | 15/24 [00:39<00:24,  2.69s/it]

Count 18
Params 0.005 0.7 0.3


 67%|██████▋   | 16/24 [00:42<00:21,  2.73s/it]

Count 15
Params 0.005 0.8 0.0


 71%|███████   | 17/24 [00:45<00:19,  2.83s/it]

Count 19
Params 0.005 0.8 0.05


 75%|███████▌  | 18/24 [00:47<00:16,  2.72s/it]

Count 19
Params 0.005 0.8 0.1


 79%|███████▉  | 19/24 [00:50<00:13,  2.63s/it]

Count 19
Params 0.005 0.8 0.3


 83%|████████▎ | 20/24 [00:52<00:10,  2.66s/it]

Count 16
Params 0.005 0.9 0.0


 88%|████████▊ | 21/24 [00:55<00:08,  2.69s/it]

Count 20
Params 0.005 0.9 0.05


 92%|█████████▏| 22/24 [00:58<00:05,  2.64s/it]

Count 20
Params 0.005 0.9 0.1


 96%|█████████▌| 23/24 [01:01<00:02,  2.75s/it]

Count 20
Params 0.005 0.9 0.3


100%|██████████| 24/24 [01:04<00:00,  2.67s/it]

Count 18





In [31]:
import sklearn.metrics as metrics
import numpy as np
import pyopa
from tqdm import tqdm
from Bio import pairwise2
from Bio.Align import substitution_matrices

sub_matrix = substitution_matrices.load("BLOSUM62")


def _global_alignment_score(first, second):
    """Score-only global alignment of two sequences.

    Uses `sub_matrix` for substitutions, with gap-open penalty -10 and
    gap-extend penalty -0.5.
    """
    return pairwise2.align.globalds(first, second, sub_matrix, -10, -0.5, score_only=True)


# Every ordered pair of entries with different ids, so both (a, b) and
# (b, a) are present.
# NOTE(review): if the score does not depend on argument order, scoring each
# unordered pair once would halve the work — confirm before changing.
pairs = [
    (left, right)
    for left in c_terminal_sequences
    for right in c_terminal_sequences
    if left[0] != right[0]
]

# (id1, id2, alignment score) per pair. Despite the variable name these are
# similarity scores: higher means more similar.
distances = [
    (left_id, right_id, _global_alignment_score(left_seq, right_seq))
    for (left_id, left_seq), (right_id, right_seq) in tqdm(pairs)
]
distances[0]

100%|██████████| 37056/37056 [02:35<00:00, 238.58it/s]


('KC1-P2-N_CL1062Contig1_1', 'KC1-P2-N_CL2249Contig1_1', 77.5)

In [45]:
def _entry_for(seq_id):
    """Return the first (id, sequence) entry of c_terminal_sequences with this id."""
    return next(entry for entry in c_terminal_sequences if entry[0] == seq_id)


def _score_of(scored_pair):
    """Alignment score stored in the third slot of an (id1, id2, score) tuple."""
    return scored_pair[2]


# Pair with the highest alignment score, i.e. the most similar sequences.
max_score_pair = max(distances, key=_score_of)[:2]
bseq1 = _entry_for(max_score_pair[0])
bseq2 = _entry_for(max_score_pair[1])

print(f"Most similar sequences: {max_score_pair[0]} {max_score_pair[1]}")
print(f"Sequence 1: {bseq1[1]}")
print(f"Sequence 2: {bseq2[1]}")

# Pair with the lowest alignment score, i.e. the least similar sequences.
min_score_pair = min(distances, key=_score_of)[:2]
wseq1 = _entry_for(min_score_pair[0])
wseq2 = _entry_for(min_score_pair[1])

print()
print(f"Least similar sequences: {min_score_pair[0]} {min_score_pair[1]}")
print(f"Sequence 1: {wseq1[1]}")
print(f"Sequence 2: {wseq2[1]}")

# Full alignment (not just the score) of the most similar pair; only the
# first reported alignment is printed.
alignment = pairwise2.align.globalds(bseq1[1], bseq2[1], sub_matrix, -10, -0.5)
print()
print("Most similar alignment:")
print(pairwise2.format_alignment(*alignment[0]))


# Same for the least similar pair.
alignment = pairwise2.align.globalds(wseq1[1], wseq2[1], sub_matrix, -10, -0.5)
print()
print("Least similar alignment:")
print(pairwise2.format_alignment(*alignment[0]))


Most similar sequences: KC1-P2-N_k69_Locus_12056_Transcript_1_1 KC1-P2_N3_k55_Locus_6321_Transcript_1_1
Sequence 1: LGQRLQALLHASLLGAPPAVSGALVPESGCEWVQQELNELQLPPFPELGGFEFHLPPLPRLLPDTRGFEGLRSRLPSAPTTLAGHPTVVAEAATVAAASPSPLAAEQQQARWGAAAVGAGAGVAVSALFAFAFVGCSRRR
Sequence 2: AGALAPEAGCEWVQRELNELQLPDFPDLGGFEFRLPPLPRLLPGTPDFEGLRSRLPSAVPTTLAGKPPGVAEAATVAAASPSPFVAAEQQQARWGAAAAGAGAGVTVSALFVFAFVGSRRRGSTGRAGAPPAHSGRRCQR

Least similar sequences: KC1-P2-N_k25_Locus_23255_Transcript_1_1 KC1-P2-N_k37_Locus_2012_Transcript_10_1
Sequence 1: ATPGAGCEWVKDFKQKFPDFPDFPDFPFKFELPPVPALPGRLLPQTLLEAAHRPLDPPAAAAAEKGGLDLSELTAAGLGFGGGMLVVGAVGALSLRRKGRLDLQIEEARRMEAAEAQGAVPASVAGLPALSLSSSRQMSA
Sequence 2: SWEQLQSLLVQQQQRARAICRRATAESDSGERHQSSTLESDIRAVHRRATLERTPESDTEERQWRATPKSDTESDSGRATVESDSGERHRRATTPESDSSNFCTSASPNPRAVPSPSERDSLGLWRKGPFEKRALPCFCV

Most similar alignment:
LGQRLQALLHASLLGAPPAVSGALVPESGCEWVQQELNELQLPPFPELGGFEFHLPPLPRLLPDTRGFEGLRSRLPSA-PTTLAGHPTVVAEAATVAAASPSP-LAAEQQQARWGAAAVGAGAGVAVSALFAFAFVGCSRR------------

In [63]:
# Alignment scores keyed by the ordered pair of sequence ids.
distances_dict = { (id1, id2): dist for id1, id2, dist in distances }
# `default=0` keeps this cell runnable when there are no pairs at all.
min_score = min(distances_dict.values(), default=0)
# Add |min_score| to all scores so that no score is negative
distances_dict = { (id1, id2): dist + abs(min_score) for (id1, id2), dist in distances_dict.items() }

# Compute distance matrix: distance = 1 / shifted score, so a higher
# alignment score gives a smaller distance.
sequence_ids = [seq_id for seq_id, _ in c_terminal_sequences]
n = len(sequence_ids)
distance_matrix = np.zeros((n, n), dtype=np.float64)
for i, id1 in enumerate(sequence_ids):
    for j, id2 in enumerate(sequence_ids):
        if id1 == id2:
            # Same id: the distance stays at its initial value of 0.
            continue
        # Named `shifted_score` so it cannot collide with the `score_df`
        # function defined elsewhere in this notebook.
        shifted_score = distances_dict[(id1, id2)]
        # A shifted score of exactly 0 cannot be inverted; such pairs get a
        # fixed distance of 2.0 instead.
        distance_matrix[i, j] = (1 / shifted_score) if shifted_score != 0 else 2.0

In [66]:
import os
def score_df(cluster_df):
    """Silhouette score of one clustering against the precomputed distance matrix.

    Args:
        cluster_df: table with one row per cluster and a 'cluster-member'
            column holding the member ids, either as a list or as the string
            representation of a list (which is what a list column becomes
            after a CSV round trip). Member ids are matched exactly, with or
            without the 'seq_' prefix used in the FASTA headers.

    Returns:
        The value of ``metrics.silhouette_score`` computed on
        ``distance_matrix`` (metric='precomputed'), where every entry of
        ``c_terminal_sequences`` is labelled with the index label of the
        cluster row that contains it.

    Raises:
        AssertionError: if a sequence id is found in no cluster or in more
            than one.
    """
    import ast

    # Map every member id to the set of cluster row labels that list it.
    # Parsing the members and matching them exactly avoids substring hits,
    # e.g. an id ending in '_1' matching a member ending in '_10'.
    clusters_by_member = {}
    for row_label, members in cluster_df['cluster-member'].items():
        if isinstance(members, str):
            try:
                members = ast.literal_eval(members)
            except (ValueError, SyntaxError):
                # Not a list literal: treat the cell as a single member id.
                members = [members]
        for member in members:
            clusters_by_member.setdefault(member, set()).add(row_label)

    labels = []
    for seq_id, _ in c_terminal_sequences:
        found = clusters_by_member.get(seq_id, set()) | clusters_by_member.get(f'seq_{seq_id}', set())
        assert len(found) == 1, f"Could not find cluster for {seq_id}"
        labels.append(next(iter(found)))
    return metrics.silhouette_score(distance_matrix, labels, metric='precomputed')

# Score every clustering CSV found directly inside `directory`.
directory = 'clustered'
files = os.listdir(directory)
results = {}
for file in files:
    path = os.path.join(directory, file)
    # Skip anything that is not a regular file (e.g. sub-directories).
    if not os.path.isfile(path):
        continue
    results[file] = score_df(pd.read_csv(path))

In [67]:
# Best-scoring clustering as a (file name, score) tuple.
max(results.items(), key=lambda entry: entry[1])

('clustered_0.005_0.9_0.3.csv', -0.01893977566782182)

In [70]:
import os
def score_df(cluster_df):
    """Unfinished variant of the cluster scoring function.

    Builds one cluster label per entry of ``c_terminal_sequences`` and then
    iterates over the cluster rows. There is no ``return`` statement, so the
    function returns ``None``.

    NOTE(review): an earlier cell defines a function with the same name that
    returns a silhouette score; running this cell replaces that definition.
    """
    labels = []
    for id, _ in c_terminal_sequences:
        # Rows whose 'cluster-member' value contains `id`.
        # NOTE(review): the caller passes a frame loaded with pd.read_csv, so
        # 'cluster-member' is presumably the string form of a list and this
        # is a substring test — confirm.
        cluster = cluster_df[cluster_df['cluster-member'].apply(lambda x: id in x)]
        assert len(cluster) == 1, f"Could not find cluster for {id}"
        labels.append(cluster.index[0])
    
    # Compute and score consensus sequence for each cluster
    # NOTE(review): no consensus is computed or scored yet; `labels`,
    # `cluster_id` and `cluster_seqs` are never used after being assigned.
    for cluster in cluster_df.iterrows():
        # iterrows() yields (index label, row) pairs.
        cluster_id = cluster[0]
        # NOTE(review): int(i) treats each element of 'cluster-member' as a
        # position in c_terminal_sequences. If that value is a string, `i`
        # is a single character and int() would presumably raise ValueError
        # — confirm the intended lookup.
        cluster_seqs = [ c_terminal_sequences[int(i)][0] for i in cluster[1]['cluster-member'] ]
        
    

# Score the clustering CSVs inside `directory`, stopping after the first
# regular file.
directory = 'clustered'
files = os.listdir(directory)
results = {}
for file in files:
    path = os.path.join(directory, file)
    # Skip anything that is not a regular file (e.g. sub-directories).
    if not os.path.isfile(path):
        continue
    results[file] = score_df(pd.read_csv(path))
    # Only the first regular file is processed.
    break

Index([5], dtype='int64')
Index([1], dtype='int64')
Index([96], dtype='int64')
Index([4], dtype='int64')
Index([2], dtype='int64')
Index([0], dtype='int64')
Index([11], dtype='int64')
Index([86], dtype='int64')
Index([39], dtype='int64')
Index([2], dtype='int64')
Index([49], dtype='int64')
Index([53], dtype='int64')
Index([12], dtype='int64')
Index([22], dtype='int64')
Index([4], dtype='int64')
Index([25], dtype='int64')
Index([29], dtype='int64')
Index([33], dtype='int64')
Index([5], dtype='int64')
Index([81], dtype='int64')
Index([4], dtype='int64')
Index([5], dtype='int64')
Index([83], dtype='int64')
Index([95], dtype='int64')
Index([88], dtype='int64')
Index([2], dtype='int64')
Index([78], dtype='int64')
Index([64], dtype='int64')
Index([8], dtype='int64')
Index([16], dtype='int64')
Index([57], dtype='int64')
Index([4], dtype='int64')
Index([0], dtype='int64')
Index([75], dtype='int64')
Index([74], dtype='int64')
Index([3], dtype='int64')
Index([73], dtype='int64')
Index([3], dtype