# Hierarchical Clustering

In [1]:
# importing required packages
import pandas as pd
from sklearn.metrics.cluster import normalized_mutual_info_score
import csv
import os

## Preprocessing of Data for Clustering

In [2]:
# reading the csv file and creating a dataframe
# (the path is relative, so "df_na_full.csv" must sit in the notebook's working directory)
df = pd.read_csv("df_na_full.csv")

In [3]:
# previewing the first rows of the freshly loaded table
df.head()

Unnamed: 0.1,Unnamed: 0,"Ni_et_al_2016_13, 8, selectivity=0.125","Weiser_et_al_2017_28, 47, selectivity=0.02127659574468085","Ni_et_al_2016_11, 16, selectivity=0.0625","Ni_et_al_2016_8, 24, selectivity=0.041666666666666664","Weiser_et_al_2017_25, 122, selectivity=0.00819672131147541","Ni_et_al_2016_1, 72, selectivity=0.013888888888888888","Ni_et_al_2016_3, 67, selectivity=0.014925373134328358","Ni_et_al_2016_9, 15, selectivity=0.06666666666666667","Ni_et_al_2016_4, 81, selectivity=0.012345679012345678",...,"Cecere_et_al_2014_5, 3709, selectivity=0.00026961445133459155","Gushchanskaia_et_al_2019_3, 1488, selectivity=0.0006720430107526882","Garrigues_et_al_2022_2, 5220, selectivity=0.00019157088122605365","Nguyen_et_al_2021_11, 1373, selectivity=0.0007283321194464676","Singh_et_al_2021_11, 1417, selectivity=0.0007057163020465773","Reed_et_al_2020_11, 2995, selectivity=0.000333889816360601","Gushchanskaia_et_al_2019_4, 5000, selectivity=0.0002","Chaves_et_al_2021_3, 3236, selectivity=0.00030902348578491963","Quarato_et_al_2021_3, 1432, selectivity=0.0006983240223463687","Gushchanskaia_et_al_2019_5, 4000, selectivity=0.00025"
0,"WBGene00016953, r-score = 0.005896752388133557",,1.0,,,1.0,,,,,...,,,,,,,,,,
1,"WBGene00017069, r-score = 0.00536004723543325",1.0,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,...,,,,,,,,,,
2,"WBGene00021239, r-score = 0.0048580828900456055",,,,,1.0,,,,,...,,,,,,,,,,
3,"WBGene00007454, r-score = 0.004519726505793647",,1.0,,1.0,1.0,1.0,,,1.0,...,,,,,,,,,,
4,"WBGene00008447, r-score = 0.004338803221323566",,,1.0,,,,,,,...,,,,,,,,,,


In [4]:
# the gene names were read in under the placeholder label 'Unnamed: 0';
# give that column the explicit label "Gene"
df = df.rename(columns={'Unnamed: 0': 'Gene'})

In [5]:
# genes are the items to be clustered, so they have to become the columns:
# index the frame by gene label, then swap rows and columns
df = df.set_index('Gene').T

In [6]:
# inspecting the transposed frame
df

Gene,"WBGene00016953, r-score = 0.005896752388133557","WBGene00017069, r-score = 0.00536004723543325","WBGene00021239, r-score = 0.0048580828900456055","WBGene00007454, r-score = 0.004519726505793647","WBGene00008447, r-score = 0.004338803221323566","WBGene00022730, r-score = 0.00429578208852772","WBGene00007489, r-score = 0.0042644969761254795","WBGene00007624, r-score = 0.0042188951864913175","WBGene00007675, r-score = 0.004203752766251299","WBGene00011059, r-score = 0.00418053723599639",...,"WBGene00010989, r-score = 0.002213377470150565","WBGene00011964, r-score = 0.002143230750513301","WBGene00015735, r-score = 0.002128297057884726","WBGene00018878, r-score = 0.0021258199951248702","WBGene00012290, r-score = 0.002110405573624195","WBGene00018310, r-score = 0.001950511622303178","WBGene00013152, r-score = 0.0018155460025849946","WBGene00010100, r-score = 0.0017855810811114455","WBGene00007307, r-score = 0.0017487758922683986","WBGene00021106, r-score = 0.0011581393056345013"
"Ni_et_al_2016_13, 8, selectivity=0.125",,1.0,,,,,,,,,...,,,1.0,,,1.0,,,,
"Weiser_et_al_2017_28, 47, selectivity=0.02127659574468085",1.0,1.0,,1.0,,,,,,1.0,...,,1.0,,1.0,,1.0,,,,
"Ni_et_al_2016_11, 16, selectivity=0.0625",,1.0,,,1.0,,,,,,...,,,1.0,,,1.0,,,,
"Ni_et_al_2016_8, 24, selectivity=0.041666666666666664",,1.0,,1.0,,,,,,,...,,,,,,1.0,,,,
"Weiser_et_al_2017_25, 122, selectivity=0.00819672131147541",1.0,1.0,1.0,1.0,,,1.0,,1.0,1.0,...,,1.0,1.0,1.0,,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Reed_et_al_2020_11, 2995, selectivity=0.000333889816360601",,,,,,,,,,,...,,,,,,,,,,
"Gushchanskaia_et_al_2019_4, 5000, selectivity=0.0002",,,,,,,,,,,...,,,,,,1.0,,,1.0,1.0
"Chaves_et_al_2021_3, 3236, selectivity=0.00030902348578491963",,,,,,,,,,,...,,,,,,,,,,
"Quarato_et_al_2021_3, 1432, selectivity=0.0006983240223463687",,,,,,,,,,,...,,,,,,,,,,


In [7]:
# renaming columns for readability: keep only the text before the first comma
# of each label (e.g. "WBGene00016953, r-score = 0.0058..." -> "WBGene00016953")
ds_old = df.columns
# str.partition hands back the whole label when it contains no comma, whereas
# slicing up to str.find(",") would cut off the last character in that case
# (find returns -1 when nothing is found)
ds_new = [ds.partition(",")[0] for ds in ds_old]

print(ds_new)

['WBGene00016953', 'WBGene00017069', 'WBGene00021239', 'WBGene00007454', 'WBGene00008447', 'WBGene00022730', 'WBGene00007489', 'WBGene00007624', 'WBGene00007675', 'WBGene00011059', 'WBGene00008010', 'WBGene00008862', 'WBGene00044258', 'WBGene00003915', 'WBGene00001855', 'WBGene00000405', 'WBGene00011805', 'WBGene00016885', 'WBGene00018204', 'WBGene00019598', 'WBGene00006537', 'WBGene00010012', 'WBGene00021018', 'WBGene00004976', 'WBGene00017641', 'WBGene00003235', 'WBGene00022834', 'WBGene00008681', 'WBGene00018199', 'WBGene00010507', 'WBGene00003159', 'WBGene00004751', 'WBGene00021019', 'WBGene00002219', 'WBGene00007303', 'WBGene00010212', 'WBGene00009247', 'WBGene00014082', 'WBGene00003920', 'WBGene00000915', 'WBGene00001638', 'WBGene00002225', 'WBGene00000473', 'WBGene00044502', 'WBGene00018223', 'WBGene00012198', 'WBGene00020183', 'WBGene00017101', 'WBGene00014148', 'WBGene00019174', 'WBGene00012961', 'WBGene00016512', 'WBGene00019070', 'WBGene00009572', 'WBGene00016029', 'WBGene00

In [8]:
# swapping the long column labels for the shortened ones built above
df = df.set_axis(ds_new, axis=1)

In [9]:
# inspecting the frame after the column labels were shortened
df

Unnamed: 0,WBGene00016953,WBGene00017069,WBGene00021239,WBGene00007454,WBGene00008447,WBGene00022730,WBGene00007489,WBGene00007624,WBGene00007675,WBGene00011059,...,WBGene00010989,WBGene00011964,WBGene00015735,WBGene00018878,WBGene00012290,WBGene00018310,WBGene00013152,WBGene00010100,WBGene00007307,WBGene00021106
"Ni_et_al_2016_13, 8, selectivity=0.125",,1.0,,,,,,,,,...,,,1.0,,,1.0,,,,
"Weiser_et_al_2017_28, 47, selectivity=0.02127659574468085",1.0,1.0,,1.0,,,,,,1.0,...,,1.0,,1.0,,1.0,,,,
"Ni_et_al_2016_11, 16, selectivity=0.0625",,1.0,,,1.0,,,,,,...,,,1.0,,,1.0,,,,
"Ni_et_al_2016_8, 24, selectivity=0.041666666666666664",,1.0,,1.0,,,,,,,...,,,,,,1.0,,,,
"Weiser_et_al_2017_25, 122, selectivity=0.00819672131147541",1.0,1.0,1.0,1.0,,,1.0,,1.0,1.0,...,,1.0,1.0,1.0,,1.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Reed_et_al_2020_11, 2995, selectivity=0.000333889816360601",,,,,,,,,,,...,,,,,,,,,,
"Gushchanskaia_et_al_2019_4, 5000, selectivity=0.0002",,,,,,,,,,,...,,,,,,1.0,,,1.0,1.0
"Chaves_et_al_2021_3, 3236, selectivity=0.00030902348578491963",,,,,,,,,,,...,,,,,,,,,,
"Quarato_et_al_2021_3, 1432, selectivity=0.0006983240223463687",,,,,,,,,,,...,,,,,,,,,,


In [10]:
# replacing all NAs with 0
# NOTE(review): presumably a missing value means the gene was not reported in
# that dataset, so 0/1 acts as an absence/presence flag - confirm
df = df.fillna(0)

In [11]:
# previewing the first rows after the missing values were replaced with 0
df.head()

Unnamed: 0,WBGene00016953,WBGene00017069,WBGene00021239,WBGene00007454,WBGene00008447,WBGene00022730,WBGene00007489,WBGene00007624,WBGene00007675,WBGene00011059,...,WBGene00010989,WBGene00011964,WBGene00015735,WBGene00018878,WBGene00012290,WBGene00018310,WBGene00013152,WBGene00010100,WBGene00007307,WBGene00021106
"Ni_et_al_2016_13, 8, selectivity=0.125",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"Weiser_et_al_2017_28, 47, selectivity=0.02127659574468085",1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
"Ni_et_al_2016_11, 16, selectivity=0.0625",0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"Ni_et_al_2016_8, 24, selectivity=0.041666666666666664",0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"Weiser_et_al_2017_25, 122, selectivity=0.00819672131147541",1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


## Gene Renaming

In [12]:
# collecting the WBGene names, i.e. the current column labels of df
# (note: this is a pandas Index, not a plain Python list)
genes = df.columns

In [13]:
# reading in the Wormbase database to a dataframe
# (tab-separated text; everything from a "#" to the end of a line is ignored as a comment)
db = pd.read_csv('Wormbase_Gene_Sanitizer_Database_DownloadedOn_10-30-2023.txt', comment="#", sep="\t")

In [14]:
# inspecting the Wormbase lookup table
db

Unnamed: 0,Input,Status,Suggested Match,Public Name For,Sequence Name For,Molecular Name For,Other Name For,Split Into,Merged Into,History
0,WBGene00000001,Live - Unique,WBGene00000001,,,,,,,History: N.A.
1,WBGene00000002,Live - Unique,WBGene00000002,,,,,,,History: N.A.
2,WBGene00000003,Live - Unique,WBGene00000003,,,,,,,History: N.A.
3,WBGene00000004,Live - Unique,WBGene00000004,,,,,,,History: N.A.
4,WBGene00000005,Live - Unique,WBGene00000005,,,,,,,History: N.A.
...,...,...,...,...,...,...,...,...,...,...
257830,zyg-12,Live - Unique,WBGene00006997,WBGene00006997,,,,,,Public name for WBGene00006997. History: N.A.
257831,zyg-13,Live - Unique,WBGene00006998,WBGene00006998,,,,,,Public name for WBGene00006998. History: N.A.
257832,zyg-14,Live - Unique,WBGene00003916,,,,WBGene00003916,,,Other name for WBGene00003916(par-1). History:...
257833,zyx-1,Live - Unique,WBGene00006999,WBGene00006999,,,,,,Public name for WBGene00006999. History: N.A.


In [15]:
# converting WBGene names to common names
# A row whose "Public Name For" field equals a WBGene ID carries the matching
# common name in the first column of the database.
public_name_for = db["Public Name For"]
# first column of the database; selected by position so the lookup does not
# depend on the frame's index labels
# NOTE(review): this appears to be the "Input" column - confirm
first_column = db.iloc[:, 0]

new_names = []
for gene in genes:
    # boolean-mask selection works for any index, unlike feeding index labels
    # into the positional .iloc accessor
    matches = first_column[public_name_for == gene]
    if len(matches) != 1:
        # exactly one public name is required; fail loudly and say which gene
        # is missing or ambiguous
        raise ValueError(
            "expected exactly one public name for {!r}, found {}".format(gene, len(matches))
        )
    new_names.append(matches.iloc[0])
print(new_names)

['C55C3.3', 'timm-17B.2', 'Y20F4.4', 'C08F11.7', 'E01G4.5', 'ZK402.3', 'C09G5.7', 'hrde-1', 'C18D4.6', 'R06C1.4', 'C38D9.2', 'F15D4.5', 'Y57G11C.51', 'pan-1', 'hil-4', 'cdk-1', 'T16G12.4', 'fbxb-97', 'F39F10.4', 'K09H9.7', 'tbb-2', 'saeg-1', 'W04B5.1', 'spe-41', 'csr-1', 'mif-2', 'ZK973.8', 'scrm-4', 'F39E9.7', 'K02E2.6', 'mcm-7', 'sea-2', 'W04B5.2', 'klp-7', 'rnh-1.3', 'fbxa-192', 'bath-45', 'ZK795.2', 'par-5', 'hsp-90', 'gly-13', 'klp-15', 'cey-2', 'C46G7.5', 'bath-13', 'citk-1', 'T03D3.5', 'E02H9.3', 'ZK909.3', 'H09G03.1', 'Y47H10A.5', 'C38C3.3', 'F58H7.5', 'F40D4.13', 'elf-1', 'vet-6', 'pyk-1', 'lin-15B', 'clp-6', 'T02G5.4', 'hsp-1', 'rme-2', 'ceh-20', 'T11F9.10', 'lido-18', 'Y17D7B.4', 'C04G6.6', 'C30G12.1', 'wago-1', 'cpg-1', 'T16G12.8', 'pgl-3', 'Y57G7A.5', 'K05C4.9', 'F09C8.2', 'wago-4', 'Y48G1BM.6', 'fkb-8', 'tba-2', 'dyf-3', 'Y105C5A.14', 'F55B11.6', 'Y37E11B.2', 'T20F7.1', 'gpx-8', 'his-24', 'qdpr-1', 'W05F2.4', 'R03H10.6', 'W09B7.2', 'R03D7.2', 'saeg-2', 'pdfr-1', 'glit-1',

In [16]:
# Save top 100 common gene names to a csv, one name per row
with open('top_100.csv', 'w', newline='') as out_file:
    csv.writer(out_file).writerows([name] for name in new_names)

In [17]:
# relabelling the columns: WBGene names are replaced by the common names
df = df.set_axis(new_names, axis=1)

In [18]:
# inspecting the frame with the common gene names as column labels
df

Unnamed: 0,C55C3.3,timm-17B.2,Y20F4.4,C08F11.7,E01G4.5,ZK402.3,C09G5.7,hrde-1,C18D4.6,R06C1.4,...,R03D7.2,saeg-2,pdfr-1,glit-1,W05H12.2,F41G4.7,Y53F4B.5,F55C9.3,spch-1,W09B7.1
"Ni_et_al_2016_13, 8, selectivity=0.125",0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"Weiser_et_al_2017_28, 47, selectivity=0.02127659574468085",1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
"Ni_et_al_2016_11, 16, selectivity=0.0625",0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"Ni_et_al_2016_8, 24, selectivity=0.041666666666666664",0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
"Weiser_et_al_2017_25, 122, selectivity=0.00819672131147541",1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,...,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Reed_et_al_2020_11, 2995, selectivity=0.000333889816360601",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Gushchanskaia_et_al_2019_4, 5000, selectivity=0.0002",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0
"Chaves_et_al_2021_3, 3236, selectivity=0.00030902348578491963",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"Quarato_et_al_2021_3, 1432, selectivity=0.0006983240223463687",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Clustering

In [19]:
# empty square frame, labelled by gene on both axes, that will hold the
# pairwise NMI values
gene_labels = df.columns
NMI_matrix = pd.DataFrame(columns=gene_labels, index=gene_labels)

In [20]:
# inspecting the (still empty) NMI matrix
NMI_matrix

Unnamed: 0,C55C3.3,timm-17B.2,Y20F4.4,C08F11.7,E01G4.5,ZK402.3,C09G5.7,hrde-1,C18D4.6,R06C1.4,...,R03D7.2,saeg-2,pdfr-1,glit-1,W05H12.2,F41G4.7,Y53F4B.5,F55C9.3,spch-1,W09B7.1
C55C3.3,,,,,,,,,,,...,,,,,,,,,,
timm-17B.2,,,,,,,,,,,...,,,,,,,,,,
Y20F4.4,,,,,,,,,,,...,,,,,,,,,,
C08F11.7,,,,,,,,,,,...,,,,,,,,,,
E01G4.5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F41G4.7,,,,,,,,,,,...,,,,,,,,,,
Y53F4B.5,,,,,,,,,,,...,,,,,,,,,,
F55C9.3,,,,,,,,,,,...,,,,,,,,,,
spch-1,,,,,,,,,,,...,,,,,,,,,,


In [21]:
# Calculating NMI for each gene pair
# NMI is symmetric, so every unordered pair (diagonal included) is scored once
# and the value is written to both mirrored cells; this roughly halves the
# number of scoring calls. Positional access (.iloc / .iat) is used because
# NMI_matrix was created with df.columns on both axes, and it keeps working
# even if two columns happen to share a name.
n_genes = len(df.columns)
for i in range(n_genes):
    for j in range(i, n_genes):
        score = normalized_mutual_info_score(df.iloc[:, i], df.iloc[:, j])
        NMI_matrix.iat[i, j] = score
        NMI_matrix.iat[j, i] = score

In [22]:
# inspecting the filled NMI matrix
NMI_matrix

Unnamed: 0,C55C3.3,timm-17B.2,Y20F4.4,C08F11.7,E01G4.5,ZK402.3,C09G5.7,hrde-1,C18D4.6,R06C1.4,...,R03D7.2,saeg-2,pdfr-1,glit-1,W05H12.2,F41G4.7,Y53F4B.5,F55C9.3,spch-1,W09B7.1
C55C3.3,1.0,0.012817,0.077218,0.047166,0.003581,0.041302,0.145141,0.002292,0.09708,0.103088,...,0.027285,0.046231,0.004302,0.023342,0.002243,0.035474,0.015798,0.004886,0.100511,0.028119
timm-17B.2,0.012817,1.0,0.113406,0.232942,0.024525,0.007049,0.076712,0.007049,0.102202,0.108447,...,0.040253,0.158242,0.173559,0.197866,0.032834,0.127879,0.156392,0.001495,0.002339,0.099436
Y20F4.4,0.077218,0.113406,1.0,0.118955,0.023535,0.032769,0.035208,0.005047,0.210159,0.232052,...,0.02316,0.089686,0.059396,0.110586,0.019297,0.066345,0.042232,0.00189,0.00189,0.116917
C08F11.7,0.047166,0.232942,0.118955,1.0,0.018382,0.009544,0.066937,0.000267,0.164016,0.123334,...,0.015187,0.185793,0.154929,0.116123,0.00948,0.162843,0.099822,0.002325,0.000689,0.176955
E01G4.5,0.003581,0.024525,0.023535,0.018382,1.0,0.197519,0.050039,0.032023,0.054187,0.022734,...,0.004683,0.013565,0.024895,0.036476,0.299808,0.013565,0.010958,0.283211,0.000956,0.024053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F41G4.7,0.035474,0.127879,0.066345,0.162843,0.013565,0.009544,0.018936,0.00006,0.075721,0.096465,...,0.019058,0.310299,0.084106,0.115286,0.014356,1.0,0.240104,0.0,0.004779,0.271192
Y53F4B.5,0.015798,0.156392,0.042232,0.099822,0.010958,0.004694,0.058001,0.000493,0.044875,0.054416,...,0.045854,0.171827,0.047124,0.118652,0.014329,0.240104,1.0,0.00088,0.003253,0.152276
F55C9.3,0.004886,0.001495,0.00189,0.002325,0.283211,0.25773,0.039231,0.007008,0.024135,0.005693,...,0.008851,0.006508,0.008399,0.031683,0.299976,0.0,0.00088,1.0,0.020536,0.004235
spch-1,0.100511,0.002339,0.00189,0.000689,0.000956,0.004694,0.033762,0.007008,0.009624,0.008119,...,0.015508,0.004779,0.003612,0.002353,0.002063,0.004779,0.003253,0.020536,1.0,0.008295


In [23]:
# turning similarity into distance: the NMI values are cast to float and
# subtracted from 1, so an NMI of 1 becomes a distance of 0
nmi_values = NMI_matrix.astype(float)
dist_matrix = 1 - nmi_values

In [24]:
# inspecting the distance matrix
dist_matrix

Unnamed: 0,C55C3.3,timm-17B.2,Y20F4.4,C08F11.7,E01G4.5,ZK402.3,C09G5.7,hrde-1,C18D4.6,R06C1.4,...,R03D7.2,saeg-2,pdfr-1,glit-1,W05H12.2,F41G4.7,Y53F4B.5,F55C9.3,spch-1,W09B7.1
C55C3.3,0.000000,0.987183,0.922782,0.952834,0.996419,0.958698,0.854859,0.997708,0.902920,0.896912,...,0.972715,0.953769,0.995698,0.976658,0.997757,0.964526,0.984202,0.995114,0.899489,0.971881
timm-17B.2,0.987183,0.000000,0.886594,0.767058,0.975475,0.992951,0.923288,0.992951,0.897798,0.891553,...,0.959747,0.841758,0.826441,0.802134,0.967166,0.872121,0.843608,0.998505,0.997661,0.900564
Y20F4.4,0.922782,0.886594,0.000000,0.881045,0.976465,0.967231,0.964792,0.994953,0.789841,0.767948,...,0.976840,0.910314,0.940604,0.889414,0.980703,0.933655,0.957768,0.998110,0.998110,0.883083
C08F11.7,0.952834,0.767058,0.881045,0.000000,0.981618,0.990456,0.933063,0.999733,0.835984,0.876666,...,0.984813,0.814207,0.845071,0.883877,0.990520,0.837157,0.900178,0.997675,0.999311,0.823045
E01G4.5,0.996419,0.975475,0.976465,0.981618,0.000000,0.802481,0.949961,0.967977,0.945813,0.977266,...,0.995317,0.986435,0.975105,0.963524,0.700192,0.986435,0.989042,0.716789,0.999044,0.975947
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F41G4.7,0.964526,0.872121,0.933655,0.837157,0.986435,0.990456,0.981064,0.999940,0.924279,0.903535,...,0.980942,0.689701,0.915894,0.884714,0.985644,0.000000,0.759896,1.000000,0.995221,0.728808
Y53F4B.5,0.984202,0.843608,0.957768,0.900178,0.989042,0.995306,0.941999,0.999507,0.955125,0.945584,...,0.954146,0.828173,0.952876,0.881348,0.985671,0.759896,0.000000,0.999120,0.996747,0.847724
F55C9.3,0.995114,0.998505,0.998110,0.997675,0.716789,0.742270,0.960769,0.992992,0.975865,0.994307,...,0.991149,0.993492,0.991601,0.968317,0.700024,1.000000,0.999120,0.000000,0.979464,0.995765
spch-1,0.899489,0.997661,0.998110,0.999311,0.999044,0.995306,0.966238,0.992992,0.990376,0.991881,...,0.984492,0.995221,0.996388,0.997647,0.997937,0.995221,0.996747,0.979464,0.000000,0.991705


In [25]:
# averaging the matrix with its own transpose so that entry (i, j) and
# entry (j, i) are guaranteed to be equal
dist_matrix = dist_matrix.add(dist_matrix.transpose()).div(2)

In [26]:
# writing the symmetrized distance matrix to a csv file
dist_matrix.to_csv("dist_matrix_100_full.csv")

In [27]:
# printing the symmetrized distance matrix as plain text
print(dist_matrix)

             C55C3.3  timm-17B.2   Y20F4.4  C08F11.7   E01G4.5   ZK402.3  \
C55C3.3     0.000000    0.987183  0.922782  0.952834  0.996419  0.958698   
timm-17B.2  0.987183    0.000000  0.886594  0.767058  0.975475  0.992951   
Y20F4.4     0.922782    0.886594  0.000000  0.881045  0.976465  0.967231   
C08F11.7    0.952834    0.767058  0.881045  0.000000  0.981618  0.990456   
E01G4.5     0.996419    0.975475  0.976465  0.981618  0.000000  0.802481   
...              ...         ...       ...       ...       ...       ...   
F41G4.7     0.964526    0.872121  0.933655  0.837157  0.986435  0.990456   
Y53F4B.5    0.984202    0.843608  0.957768  0.900178  0.989042  0.995306   
F55C9.3     0.995114    0.998505  0.998110  0.997675  0.716789  0.742270   
spch-1      0.899489    0.997661  0.998110  0.999311  0.999044  0.995306   
W09B7.1     0.971881    0.900564  0.883083  0.823045  0.975947  0.978792   

             C09G5.7    hrde-1   C18D4.6   R06C1.4  ...   R03D7.2    saeg-2  \
C55C3.3 