#### Read the genome neighborhood analysis result

In [2]:
import pickle
import pandas as pd

# Read the tab-delimited genome neighborhood result into a dataframe and keep a CSV copy
df = pd.read_csv("../data/Other_Files/pfam_neighbors_PF02458.txt", delimiter='\t')
df.to_csv("../output/pfam_neighbors_PF02458.csv")


def _write_ids(path, ids):
    """Write every entry of `ids` to the text file at `path`, one entry per line."""
    with open(path, 'w') as fp:
        fp.writelines(f"{entry}\n" for entry in ids)


# Save the query and neighbor IDs into two separate txt files for the SSN submission
# Query
Query_ID = df['Query ID'].to_list()
_write_ids('../output/PF01494_with_PF02458.txt', Query_ID)

# Neighbor
Neighbor_ID = df['Neighbor ID'].to_list()
_write_ids('../output/PF02458_with_PF01494.txt', Neighbor_ID)

# Create a "PF01494 to PF02458" dictionary and a "PF02458 to PF01494" dictionary and save them.
# The two lists are row-aligned, so zipping them pairs each query with its neighbor.
# If an ID occurs in more than one row, the pairing from the last such row is the one kept.
PF01494_to_PF02458_dict = dict(zip(Query_ID, Neighbor_ID))
PF02458_to_PF01494_dict = dict(zip(Neighbor_ID, Query_ID))

with open("../output/PF01494_to_PF02458_dict.pkl", 'wb') as file_handle:
    pickle.dump(PF01494_to_PF02458_dict, file_handle)

with open("../output/PF02458_to_PF01494_dict.pkl", 'wb') as file_handle:
    pickle.dump(PF02458_to_PF01494_dict, file_handle)


#### Analyze the clusterings of PF01494 and PF02458 by the AMI score

In [6]:
import pickle
import pandas as pd
from sklearn.metrics.cluster import adjusted_mutual_info_score

# Read the PF01494 clustering file, keep a CSV copy, and map each UniProt ID to its cluster number
df2 = pd.read_csv("../data/PF01494_SSN_Score150_cluster.txt", delimiter='\t')
df2.to_csv("../output/PF01494_SSN_Score150_cluster.csv")
PF01494_cluster_id = df2['UniProt ID'].to_list()
PF01494_cluster = df2['Cluster Number'].to_list()
PF01494_id_to_cluster = dict(zip(PF01494_cluster_id, PF01494_cluster))

# Read the PF02458 clustering file, keep a CSV copy, and map each UniProt ID to its cluster number
df3 = pd.read_csv("../data/PF02458_SSN_Score150_cluster.txt", delimiter='\t')
df3.to_csv("../output/PF02458_SSN_Score150_cluster.csv")
PF02458_cluster_id = df3['UniProt ID'].to_list()
PF02458_cluster = df3['Cluster Number'].to_list()
PF02458_id_to_cluster = dict(zip(PF02458_cluster_id, PF02458_cluster))

# Read the PF01494_and_PF02458 dictionary
# NOTE: pickle.load can execute arbitrary code; only load pickle files from a trusted source.
with open("../output/PF01494_to_PF02458_dict.pkl", 'rb') as file_handle:
    PF01494_to_PF02458_dict = pickle.load(file_handle)

# Keep the PF01494 IDs whose PF02458 neighbor also appears in the PF02458 clustering file
# (presumably the clustering files omit singletons -- TODO confirm).
# IDs without an entry in the neighbor dictionary are skipped instead of raising KeyError,
# and the neighbor is looked up in the ID-to-cluster dict rather than scanning a list.
PF01494_and_PF02458 = [
    query_id
    for query_id in PF01494_cluster_id
    if query_id in PF01494_to_PF02458_dict
    and PF01494_to_PF02458_dict[query_id] in PF02458_id_to_cluster
]
print("Total pairs: ", len(PF01494_and_PF02458))

# Build two aligned label lists: position k holds the cluster numbers of the k-th pair
updated_PF01494_cluster = []
updated_PF02458_cluster = []
for PF01494_id in PF01494_and_PF02458:
    updated_PF01494_cluster.append(PF01494_id_to_cluster[PF01494_id])
    updated_PF02458_cluster.append(PF02458_id_to_cluster[PF01494_to_PF02458_dict[PF01494_id]])

# Calculate the Adjusted Mutual Information (AMI) score
AMI = adjusted_mutual_info_score(updated_PF01494_cluster, updated_PF02458_cluster)
print("AMI score: ", AMI)


Total pairs:  167
AMI score:  0.8483212948915777
