In [1]:
import pickle
import pandas as pd
from collections import defaultdict

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [8]:
# Deserialize the dataset dictionary from the repository's dataset directory.
# pickle.load can execute arbitrary code, so only use it on files you trust.
with open("../../dataset/data_dict.pkl", "rb") as pkl_file:
    data_dict = pickle.load(pkl_file)

In [9]:
# Display the stored record for a single key to inspect the layout of one entry.
data_dict["A4VQM5"]

{'uniref_id': 'UniRef50_Q9HUN4',
 'sequence': 'MRPLVATVDLTALRHNYLLAKQCAPQRKAFAVVKANAYGHGAPEAVTALREIADGFAVACLEEAEVIRGCAPEARILLLEGCFEPSEYLRAAELGLDIAVQDARQADWLLAADLARPLNVWLKLDSGMHRLGFSVDGLRECHARLKGAAQVGELNLISHFACADERGHALTETQLERYAELLELPFEHCSLANSAAVLTLPQAHMAWIRPGIMLYGATPFAELSARELGLKPVMTLTGALIAVRDVPVGESVGYGASWVAQRPSRIGTVSCGYADGYPRTAPSGTSVVIHGQRVPLAGRVSMDMLAVDLTDLPQAQLGDAVELWGAQMPIDELAQACGTIGYELLTKVTGRVPRRYIG',
 'fields': ['catalytic activity: EC = 5.1.1.1, L-alanine = D-alanine',
  "cofactor: pyridoxal 5'-phosphate",
  'functional domains: alanine racemase activity, Alanine racemase, Alanine racemase C-terminal domain-like, PLP-binding barrel, pyridoxal phosphate binding',
  'taxonomy: Bacteria, Pseudomonadota, Gammaproteobacteria',
  'protein size: 358 aa, 38485 KDa'],
 'summary': ['This protein is an alanine racemase enzyme found in various bacteria, specifically in the Pseudomonadota phylum of the Gammaproteobacteria class',
  'It is composed of 358 amino acids with a molecular weight of 

In [2]:
# Read the tab-separated test-results file; the grouping cell that follows reads its
# subject, uniprot_id, expected_answer and generated_response columns.
d = pd.read_csv("../../test_results/qmqpl5l8_vague-silence-25_e2_s616244.tsv", sep="\t")

In [4]:
# Group [expected_answer, generated_response] pairs by uniprot_id, skipping rows whose
# subject is "taxonomy".
# The columns are filtered once and zipped directly: DataFrame.iterrows() builds a new
# Series for every row, which is far slower and gives the same values here.
alld = defaultdict(list)
non_taxonomy = d[d["subject"] != "taxonomy"]
for uniprot_id, expected, generated in zip(
    non_taxonomy["uniprot_id"],
    non_taxonomy["expected_answer"],
    non_taxonomy["generated_response"],
):
    alld[uniprot_id].append([expected, generated])

In [7]:
# Show the first (key, pairs) entry of alld that holds more than two answer pairs.
# A generator expression stops at the first match instead of first materialising a
# filtered dict of every matching entry; the returned tuple is the same.
next((k, v) for k, v in alld.items() if len(v) > 2)

('A4VQM5',
 [[' EC = 5.1.1.1, L-alanine = D-alanine',
   ' This protein catalyzes the conversion of L-alanine to D-alanine.'],
  [' alanine racemase activity, Alanine racemase, Alanine racemase C-terminal domain-like, PLP-binding barrel, pyridoxal phosphate binding',
   ' The functional domains of this protein include the alanine racemase domain, Alanine racemase C-terminal domain-like, Alanine racemase, and PLP-binding barrel.'],
  [" pyridoxal 5'-phosphate",
   " The cofactors of this protein are pyridoxal 5'-phosphate and magnesium ions."]])

In [2]:
# Load the pickled dataset dictionary (only unpickle files you trust).
with open("../../dataset/data_dict.pkl", "rb") as pkl_file:
    data_dict = pickle.load(pkl_file)

# Keep just the "sequence" field of every record, keyed the same way as data_dict.
seq_dict = {
    protein_id: record["sequence"]
    for protein_id, record in data_dict.items()
}

In [13]:
# Read the split-0 assignment table and display it. Other cells filter on its "split"
# column and read IDs from its "uniprot_id" column.
split = pd.read_csv("../../dataset/split_0.csv")
split

Unnamed: 0,uniprot_id,protein_length,uniref_id,split
0,A2YUL5,389,UniRef50_Q6Z256,train
1,Q07646,335,UniRef50_Q07646,test
2,A4WMD4,264,UniRef50_A4WMD4,train
3,O07165,170,UniRef50_O07165,train
4,Q3UV31,285,UniRef50_Q3UV31,train
...,...,...,...,...
252598,Q9NS28,235,UniRef50_Q9NS28,train
252599,Q9TTU2,172,UniRef50_Q8VCP8,train
252600,Q7TTW6,268,UniRef50_Q7V8W2,train
252601,O00399,190,UniRef50_O00399,train


In [7]:
def write_fasta(input_dict, file_name):
    """Write the entries of ``input_dict`` to ``file_name`` in FASTA layout.

    Every key is emitted as a ``>``-prefixed header line, immediately followed by
    its value on a single line. Entries are written in the dictionary's iteration
    order, and an existing file at ``file_name`` is overwritten.

    Args:
        input_dict: Mapping from record identifier to sequence.
        file_name: Path of the text file to create.
    """
    with open(file_name, 'w') as handle:
        # One formatted two-line record per entry, streamed lazily into the file.
        handle.writelines(
            f">{record_id}\n{residues}\n"
            for record_id, residues in input_dict.items()
        )

In [None]:
# Write every sequence in seq_dict to a single FASTA file in the current working directory.
write_fasta(seq_dict, "Pika-DS.fa")

In [15]:
# Write one FASTA file per evaluation split ("val" and "test") of split 0.
# A loop replaces two copy-pasted calls that differed only in the split name, and the
# column is selected as split["split"] rather than via attribute access, which would
# silently resolve to a DataFrame attribute if one of that name existed.
for split_name in ("val", "test"):
    split_ids = split.loc[split["split"] == split_name, "uniprot_id"].to_list()
    write_fasta(
        {uid: seq_dict[uid] for uid in split_ids},
        f"../../dataset/split_0_{split_name}.fa",
    )

In [5]:
# Build a protein BLAST database (-dbtype prot) from Pika-DS.fa, written under blast_db/.
# This shells out to the `makeblastdb` executable, which must be on PATH; the saved
# output of this cell shows it was not found in the environment where it was last run.
!makeblastdb -in Pika-DS.fa -dbtype prot -out blast_db/Pika_blast_db

zsh:1: command not found: makeblastdb


In [49]:
# Read the tab-separated hits file. It has no header row, so the columns get the integer
# labels 0, 1 and 2 that later cells index by.
# NOTE(review): presumably query ID, hit ID and score, judging by how later cells use
# them — confirm against the command that produced this file (not shown in this notebook).
hout = pd.read_csv("split_0_test_results.out", sep="\t", header=None)
hout

Unnamed: 0,0,1,2
0,Q07646,Q07646,1761
1,Q07646,Q5EB52,1734
2,Q07646,Q2HJM9,1716
3,Q07646,P9WLR0,379
4,Q07646,Q8PDW8,163
...,...,...,...
3583975,Q4X1W0,A7M4E1,70
3583976,Q4X1W0,O86776,69
3583977,Q4X1W0,Q6FV34,69
3583978,Q4X1W0,P37695,68


In [50]:
# Set of uniprot_id values whose split is "train", for fast membership tests.
train_set = set(split.loc[split["split"] == "train", "uniprot_id"].to_list())

In [51]:
# For every value in column 0, collect the column-1 IDs that belong to train_set (hmg)
# and the matching column-2 values (scores), in file order.
# The frame is filtered once with isin() and the columns are zipped directly:
# DataFrame.iterrows() builds a Series per row, which is very slow on a table with
# millions of rows and produces the same values.
hmg, scores = defaultdict(list), defaultdict(list)
train_hits = hout[hout[1].isin(train_set)]
for query_id, hit_id, score in zip(train_hits[0], train_hits[1], train_hits[2]):
    hmg[query_id].append(hit_id)
    scores[query_id].append(score)

In [52]:
# Reorder each key's hits and scores by descending score.
# The (hit, score) pairs are sorted once and then unzipped, replacing two separate sorts
# (one of which zipped the score list with itself). This also removes the hidden
# requirement that hmg[k] be reordered before scores[k] is overwritten.
# sorted() is stable even with reverse=True, so tied scores keep their original order.
for k in hmg:
    ranked = sorted(zip(hmg[k], scores[k]), key=lambda pair: pair[1], reverse=True)
    hmg[k] = [hit for hit, _ in ranked]
    scores[k] = [score for _, score in ranked]

In [53]:
# Sanity check: every per-key score list must already be in descending order.
for score_list in scores.values():
    descending = sorted(score_list, reverse=True)
    assert descending == score_list, f"not sorted {score_list}"

In [31]:
# Spot check on the first score list only: it must equal its own descending sort.
first_scores = next(iter(scores.values()))
assert first_scores == sorted(first_scores, reverse=True)

In [54]:
# Serialize the per-key hit lists to the dataset directory.
# NOTE: hmg is a defaultdict(list), so the unpickled object is one too and will create
# an empty list on any missing-key lookup.
out_path = "../../dataset/split_homology_test_0.pkl"
with open(out_path, "wb") as pkl_file:
    pickle.dump(hmg, pkl_file)

In [55]:
# Display the hit list stored for the first key of hmg as a quick visual check.
next(iter(hmg.values()))

['Q8PDW8',
 'P9WNH3',
 'P9WNH2',
 'P9WMS3',
 'P64302',
 'Q6Q3H0',
 'P22643',
 'P24640',
 'P53750',
 'Q02104',
 'G5EDL5',
 'P59336',
 'Q9ZER0',
 'O52866',
 'Q01398',
 'O22977',
 'Q6NAM1',
 'Q8U671',
 'Q9A919',
 'B8H3S9',
 'P95276',
 'I6YC03',
 'Q700D5',
 'P26174',
 'Q8IUS5',
 'Q9FN79',
 'Q6IE26',
 'Q0IIS3',
 'P23133',
 'Q98C03',
 'Q9H6B9',
 'Q55921',
 'O22975',
 'O05235',
 'P34913',
 'P19076',
 'I6YGS0',
 'P52278',
 'Q1JU72',
 'Q48MQ7',
 'P59337',
 'P22862',
 'Q8LFX7',
 'Q7SHI0',
 'Q5FMT1',
 'P9WNH5',
 'P9WNH4',
 'Q97UA2',
 'O31581',
 'Q3V1F8',
 'P23974',
 'Q9BUJ0',
 'Q6GLL2',
 'P07099',
 'P34914',
 'O80472',
 'Q59093',
 'O31158',
 'O80474',
 'Q6U6J0',
 'Q0J0A4',
 'C0ZKI1',
 'P91143',
 'H2KZ86',
 'P46541',
 'F4IMK4',
 'P23106',
 'Q15N09',
 'D7SSD8',
 'A0KF11',
 'Q84JL7',
 'P96084',
 'Q10QA5',
 'F4IMK2',
 'Q7SY73',
 'B7KWT4',
 'P04068',
 'Q25489',
 'O80476',
 'Q890D8',
 'P07687',
 'Q12385',
 'F4JRA6',
 'B5XXN3',
 'Q0JG99',
 'A2WYS7',
 'P41667',
 'Q8MZR6',
 'Q9FW03',
 'P79381',
 'P27652',

In [2]:
# Load a previously pickled mapping from the working directory for inspection.
# pickle.load can execute arbitrary code, so only use it on files you trust.
with open("evo_split_homology_test.pkl", "rb") as pkl_file:
    out = pickle.load(pkl_file)

In [4]:
# Display the first (key, value) pair of the loaded mapping.
next(iter(out.items()))

('A0A068BGA5',
 ['Q3YRE3',
  'Q5HAK8',
  'Q84XW5',
  'Q04207',
  'Q87ZB2',
  'Q8TE68',
  'P14713',
  'Q8MR37'])

In [6]:
# Show the ten entries with the fewest values, shortest first (ties keep mapping order).
sorted(out.items(), key=lambda entry: len(entry[1]))[:10]

[('P0DQH4', ['F4J2K2']),
 ('P48181', ['Q65X71']),
 ('P49579', ['Q8KWT2']),
 ('P58367', ['C5E4T8']),
 ('P83451', ['Q14679']),
 ('Q29I93', ['Q99102']),
 ('Q2UBM1', ['Q6XZF7']),
 ('Q3MQ24', ['Q50787']),
 ('Q69UD7', ['Q5RFM9']),
 ('Q7CUX4', ['Q8TXX9'])]