Move taxon annotations over to main file

In [1]:
import pandas as pd

# Taxonomic ranks, in the order they are stored for every annotation id.
# A rank's position in this list is also its position inside each
# hierarchy_dict value, so this list is the only place that order is defined.
TAXON_RANKS = ["domain", "kingdom", "phylum", "class", "order", "family", "genus", "species"]

train_df = pd.read_csv("data/train/annotations.csv")
hierarchy_annotations = pd.read_csv("data/train/hierarchy_labels_train.csv")

# Build a shared lookup key: the last path component up to its first "." for
# the annotations, and the annotation_id up to its first "." for the labels.
train_df["id"] = train_df["path"].apply(lambda x: x.split("/")[-1].split(".")[0])
hierarchy_annotations["id"] = hierarchy_annotations["annotation_id"].apply(lambda x: x.split(".")[0])

# Enable this line if hierarchical ids are in the format "roi_image"
# hierarchy_annotations["id"] = hierarchy_annotations["id"].apply(lambda x: x.split("_")[-1] + "_" + x.split("_")[0])

# Lowercase the label headers so they can be selected by the rank names above.
hierarchy_annotations.columns = hierarchy_annotations.columns.str.lower()

# Map each id to the list of its rank values, ordered like TAXON_RANKS.
hierarchy_annotations = hierarchy_annotations[["id"] + TAXON_RANKS]
hierarchy_dict = hierarchy_annotations.set_index("id").T.to_dict("list")

# Copy every rank onto train_df; ids without a hierarchy entry get None.
for position, rank in enumerate(TAXON_RANKS):
    train_df[rank] = train_df["id"].map(
        lambda x: hierarchy_dict[x][position] if x in hierarchy_dict else None
    )

# The lookup key is only needed for the mapping above.
train_df = train_df.drop(columns=["id"])

# Store taxon indices as integer strings (e.g. 3.0 -> "3"); nulls are left as they are.
for rank in TAXON_RANKS:
    train_df[rank] = train_df[rank].apply(lambda x: str(int(x)) if pd.notnull(x) else x)

# Overwrite the annotation file with the enriched table.
train_df.to_csv("data/train/annotations.csv", index=False)

Get hierarchy and hierarchy dict without moving annotations

In [None]:
import json
import numpy as np 
import pandas as pd

from utils.utils import get_hierarchy_from_df

# Derive the hierarchy structures from the annotation table as it is on disk.
train_df = pd.read_csv("data/train/annotations.csv")
hierarchy_dict, descendent_matrix = get_hierarchy_from_df(train_df)

# Show both results before persisting them.
for caption, payload in (
    ("Hierarchy dict: ", hierarchy_dict),
    ("Descendent matrix: ", descendent_matrix),
):
    print(caption, payload)

# The dict is stored as JSON, the matrix in NumPy's .npy format.
hierarchy_dict_path = "cfg/hierarchy/hierarchy_dict.json"
descendent_matrix_path = "cfg/hierarchy/descendent_matrix.npy"

with open(hierarchy_dict_path, "w") as f:
    json.dump(hierarchy_dict, f)

np.save(descendent_matrix_path, descendent_matrix)

Check for intra-taxon annotations

In [20]:
import json
import pandas as pd

# Ranks inspected by this check, from broadest to most specific.
heirarchical_headers = ["phylum", "class", "order", "family", "genus", "species"]

train_df = pd.read_csv("data/train/annotations.csv")
# Use a context manager so the file handle is closed as soon as the JSON is parsed.
with open("data/train/index_to_taxon.json") as f:
    index_to_taxon_map = json.load(f)

# Distinct labels present in the annotation file; the check below hard-codes
# the expectation that there are exactly 79 of them.
original_annotations_set = set(train_df["label"].unique())
assert len(original_annotations_set) == 79, (
    f"expected 79 distinct labels, found {len(original_annotations_set)}"
)

def collect_hierarchy(row):
    """Return the row's non-null taxon values, in heirarchical_headers order.

    `row` is indexed by header name; headers whose value is null are skipped,
    so the result may be shorter than heirarchical_headers (or empty).
    """
    present_values = []
    for header in heirarchical_headers:
        value = row[header]
        if pd.notnull(value):
            present_values.append(value)
    return present_values

# Create a new column "hierarchy" in train_df that contains the list of hierarchy values
train_df["hierarchy"] = train_df.apply(collect_hierarchy, axis=1)

# Most specific annotated rank of each row; None when the row has no taxon values at all.
train_df["last_nodes"] = train_df["hierarchy"].apply(lambda x: x[-1] if len(x) > 0 else None)

# Translate the taxon index back to its name. A row without any taxon value
# has a null last node, which int() cannot convert; map it to None instead so
# the row is reported as a mismatch below rather than aborting the whole check.
train_df["converted_last_nodes"] = train_df["last_nodes"].apply(
    lambda x: index_to_taxon_map[str(int(x))] if pd.notnull(x) else None
    )

converted_annotations_set = set(train_df["converted_last_nodes"].unique())

# Both differences are empty when labels and hierarchy leaves agree as sets.
print("Annotations in original but not in converted: ", original_annotations_set - converted_annotations_set)
print("Annotations in converted but not in original: ", converted_annotations_set - original_annotations_set)


def check_label(row):
    """Tell whether the row's converted last node is exactly equal to its label."""
    resolved_taxon = row["converted_last_nodes"]
    annotated_label = row["label"]
    return resolved_taxon == annotated_label

# Record the per-row comparison result, then keep only the rows that failed it.
train_df["label_check"] = train_df.apply(check_label, axis=1)
failed_mask = train_df["label_check"] == False
label_check_false = train_df.loc[failed_mask]
print("Number of rows where label check is False: ", len(label_check_false))

Annotations in original but not in converted:  set()
Annotations in converted but not in original:  set()
Number of rows where label check is False:  0


Drop Domain and Kingdom

In [None]:
import json
import pandas as pd

train_df = pd.read_csv("data/train/annotations.csv")

# Read both index maps through context managers so the handles are closed
# right after parsing; the same files are rewritten at the end of this cell.
with open("data/train/index_to_taxon.json") as f:
    index_to_taxon_map = json.load(f)
with open("data/train/taxon_to_index.json") as f:
    taxon_to_index_map = json.load(f)

# All rank columns, broadest first; the first two are the ones being dropped.
heirarchical_headers = ["domain", "kingdom", "phylum", "class", "order", "family", "genus", "species"]

# Remove the domain and kingdom columns from train_df, then lower every
# remaining taxon index by 2 (nulls are left as they are).
train_df = train_df.drop(columns=["domain", "kingdom"])


def _shift_down_by_two(value):
    """Return the index reduced by 2 as a string, or the value itself when null."""
    if pd.notnull(value):
        return str(int(value) - 2)
    return value


for header in heirarchical_headers[2:]:
    train_df[header] = train_df[header].apply(_shift_down_by_two)

# Preview the first 5 rows of the adjusted table.
print("train_df after dropping domain and kingdom: ")
print(train_df.head())

# Rebuild index -> taxon without the entries keyed "0" and "1"; every other
# key is lowered by 2 and keeps its taxon value.
new_index_to_taxon_map = {
    str(int(index) - 2): taxon
    for index, taxon in index_to_taxon_map.items()
    if index not in ("0", "1")
}

# Rebuild taxon -> index without the two entries that point at indices 0 and 1
# (the same indices removed from index_to_taxon_map); every other index is
# lowered by 2.
new_taxon_to_index_map = {}
for key, value in taxon_to_index_map.items():
    # Select the dropped entries by their index, not by taxon name: `value`
    # holds the index, so comparing it with a name can never match and would
    # let the dropped entries through with indices "-2" and "-1".
    if int(value) in (0, 1):
        continue
    new_taxon_to_index_map[key] = str(int(value) - 2)


# Write both rebuilt maps back over the JSON files they were loaded from.
for destination, mapping in (
    ("data/train/index_to_taxon.json", new_index_to_taxon_map),
    ("data/train/taxon_to_index.json", new_taxon_to_index_map),
):
    with open(destination, "w") as f:
        json.dump(mapping, f)

# Overwrite the annotation file with the adjusted table.
train_df.to_csv("data/train/annotations.csv", index=False)

train_df after dropping domain and kingdom: 
                        path                    label phylum class order  \
0  ./data/train/rois/1_1.png             Sebastolobus      2    23    45   
1  ./data/train/rois/2_2.png  Apostichopus leukothele      4    17    53   
2  ./data/train/rois/3_3.png              Scotoplanes      4    17    35   
3  ./data/train/rois/4_4.png               Keratoisis      3    20    52   
4  ./data/train/rois/4_5.png               Keratoisis      3    20    52   

  family genus species  
0     99   155     NaN  
1    101   113     160  
2     70   153     NaN  
3     77   131     NaN  
4     77   131     NaN  


Make similar adjustments to descendent matrix and hierarchy dict

In [19]:
import numpy as np
import json

descendent_matrix = np.load("cfg/hierarchy/descendent_matrix.npy")
# Use a context manager so the handle is closed right after parsing; the same
# file is rewritten at the end of this cell.
with open("cfg/hierarchy/hierarchy_dict.json") as f:
    hierarchy_dict = json.load(f)

# Show what was loaded before it is modified.
print("descendent_matrix shape: ", descendent_matrix.shape)
print("descendent_matrix: ", descendent_matrix)

# Drop first two rows and columns
new_descendent_matrix = descendent_matrix[2:, 2:]

# Sanity check: entry (row, col) of the trimmed matrix must equal entry
# (row + 2, col + 2) of the original. One vectorized comparison replaces the
# element-by-element Python loop.
assert np.array_equal(new_descendent_matrix, descendent_matrix[2:, 2:])

# Rebuild hierarchy_dict without the "0" and "1" keys (their value lists go
# with them). Every remaining key, and every entry of its value list, is
# lowered by 2 and stored as a string. Keys keep the order of the input dict;
# no sorting is applied.
# NOTE(review): a value entry that refers to node 0 or 1 would come out as
# "-2" or "-1" here -- confirm the remaining lists never reference those nodes.
new_hierarchy_dict = {
    str(int(node) - 2): [str(int(member) - 2) for member in members]
    for node, members in hierarchy_dict.items()
    if node not in ("0", "1")
}

# Overwrite both hierarchy artifacts with their trimmed versions:
# the matrix in .npy format first, then the dict as JSON.
trimmed_matrix_path = "cfg/hierarchy/descendent_matrix.npy"
trimmed_dict_path = "cfg/hierarchy/hierarchy_dict.json"

np.save(trimmed_matrix_path, new_descendent_matrix)

with open(trimmed_dict_path, "w") as f:
    json.dump(new_hierarchy_dict, f)

descendent_matrix shape:  (194, 194)
descendent_matrix:  [[1 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]
