# Plainsight data cleaning

In [1]:
import json
import pandas as pd
import numpy as np

import pprint
import statistics
import typing

from pathlib import Path
from itertools import chain

In [2]:
# Root directory of the raw data.
base_source = Path('/Users/c140-admin/Documents/03 Gleason/raw_data')
# Plainsight export; each sub-directory is read as one annotator's folder.
data_source = base_source.joinpath('explanation_annotations (Plainsight raw data)')
# Directory for images, mapping JSON files and the generated CSV files.
img_source = Path('/Users/c140-admin/Documents/03 Gleason/data')

### Annotation cleaning methods

In [3]:
def group_bfill(group):
    """Return *group* with every missing value replaced by the next valid one.

    Used as a ``groupby(...).apply`` callback; values at the end of the
    group that have no later valid entry stay missing.
    """
    backfilled = group.bfill()
    return backfilled

def group_ffill(group):
    """Return *group* with every missing value replaced by the previous valid one.

    Counterpart of the backward fill helper; values at the start of the
    group that have no earlier valid entry stay missing.
    """
    forward_filled = group.ffill()
    return forward_filled

In [4]:
def get_annotator_group(annotator_folder_name):
    """Derive the group label from the tail of an annotator folder name.

    Only the last three ``_``-separated parts of the name are inspected:

    * ``E, a, b``                -> ``"a.b"`` (the ``E`` marker is dropped)
    * ``x, <non-numeric>, c``    -> ``"c"``
    * ``x, <numeric>, <non-numeric>`` -> all three parts joined with ``.``
    * ``x, <numeric>, <numeric>``     -> ``"b.c"``

    Names with too few parts raise ``IndexError`` on the missing part.
    """
    tail = annotator_folder_name.split('_')[-3:]

    if tail[0] == 'E':
        return '.'.join(tail[1:])
    if not tail[1].isnumeric():
        return tail[2]
    if not tail[2].isnumeric():
        return '.'.join(tail)
    return '.'.join(tail[1:])


In [5]:
def get_annotations_and_coords(annotations, annotator):
  """Flatten one annotator's parsed Plainsight export into two dataframes.

  Args:
    annotations: mapping ``{annotator: export}``. As read by this function,
      ``export['labels']`` is a list whose entries carry a ``'dataId'``
      string and an ``'annotations'`` mapping ``grade -> list of polygons``.
      Each polygon has ``['data']['points']`` and a ``'children'`` mapping
      whose values have a ``'data'`` entry that is either a ``str``
      (free-text comment) or a mapping with a ``'selected'`` entry.
    annotator: key into ``annotations``; also passed to
      ``get_annotator_group`` to fill the ``group`` column.

  Returns:
    ``(explanations_df, comments_df)``. ``explanations_df`` has the columns
    annotator, TMA, grade, coords, explanations, group and receives one row
    per usable selection of a polygon, or a single fallback row when the
    polygon has none. ``comments_df`` has the columns annotator, TMA, grade,
    comments and receives one row per free-text child (empty strings too).

  NOTE(review): the ``.remove`` calls below operate on the list stored in
  ``annotations`` itself, so the caller's data is modified in place.
  """
  # True once a non-empty free-text comment was seen for the current polygon
  comment_exists = False

  # initialise the dataframes
  explanations_df = pd.DataFrame(columns = ["annotator", "TMA", "grade", "coords", "explanations", "group"])
  comments_df = pd.DataFrame(columns = ["annotator", "TMA", "grade", "comments"])

  num_TMAs = len(annotations[annotator]['labels'])

  annotator_group = get_annotator_group(annotator)


  # iterate over each TMA the annotator worked on
  for TMA_index in range(num_TMAs):

      # the TMA name is dataId without its last five characters
      # (presumably a file extension -- TODO confirm)
      TMA_name = annotations[annotator]['labels'][TMA_index]['dataId'][:-5]
      base_path_to_data = annotations[annotator]['labels'][TMA_index]["annotations"]
      grades = list(base_path_to_data.keys())

      # iterate over each grade present for this TMA
      for grade in grades:
          # always true: `grades` was built from the keys of this very mapping
          if grade in base_path_to_data:
              # determine the number of explanation polygons in the grade and iterate over them
              num_polygons = len(base_path_to_data[grade])
              for i in range(num_polygons):
                  # flag to keep track whether a textual explanation was chosen
                  text_exists = False

                  # retrieve the polygon coordinates
                  polygon_coords = base_path_to_data[grade][i]["data"]["points"]
                  explanation_keys = base_path_to_data[grade][i]["children"].keys()

                  # retrieve the textual explanations
                  for key in explanation_keys:

                      path_to_textual_explanations = base_path_to_data[grade][i]["children"][key]["data"]

                      # a plain string is a free-text comment; every one is logged in comments_df
                      if type(path_to_textual_explanations)==str:
                          comments_row = [annotator, TMA_name, grade, path_to_textual_explanations]
                          comments_df.loc[len(comments_df)] = comments_row
                          comment = path_to_textual_explanations
                          # only non-empty comments are merged into explanations later on
                          if not comment == "":
                              comment_exists = True
                      # otherwise the child holds a selection; placeholder-only selections are skipped
                      else:
                          if path_to_textual_explanations['selected'] in [['none'], 'none', [], ['Bitte wählen Sie eine passende Erklärung aus.']]:
                              continue
                          else:
                              text_exists = True
                              valid_key = key
                              textual_explanation = path_to_textual_explanations['selected']

                              # clean the textual explanations: drop placeholder entries from list selections
                              if type(textual_explanation)==list and "none" in textual_explanation:
                                  textual_explanation.remove("none")
                              if type(textual_explanation)==list and "Bitte wählen Sie eine passende Erklärung aus." in textual_explanation:
                                  textual_explanation.remove("Bitte wählen Sie eine passende Erklärung aus.")
                              if type(textual_explanation)==list and "Bitte wählen Sie mindestens eine Erklärung aus." in textual_explanation:
                                  textual_explanation.remove("Bitte wählen Sie mindestens eine Erklärung aus.")
                              # re-check after the in-place removals above.
                              # NOTE(review): text_exists is already True here, so if this
                              # `continue` fires and no other child yields a row, the polygon
                              # gets no row at all (the fallback below is skipped) -- confirm
                              # this is intended.
                              if path_to_textual_explanations['selected'] in [['none'], 'none', [], ['Bitte wählen Sie eine passende Erklärung aus.']]:
                                  continue

                              # keys outside this list of known generic field titles are
                              # prefixed to the explanation text; for list selections only
                              # the first remaining entry is kept
                              if valid_key not in ['Andere Erklärung', 'Erklärungen Gleason Grad 3',
                                                   'Erklärungen Gleason Grad 4', 'Erklärung Gleason Grad 5',
                                                   "Erklärungen Gleason 3", "Erklärungen Gleason 4",
                                                   "Erklärungen Gleason 5", "Erklärung Gleason 5", "Erklärung",
                                                  "Erklärung Gleason 4", "Erklärungen", "Explanation for Gleason 3",
                                                  "Another explanation", "Explanation for Gleason 4", "Explanation for Gleason 5",
                                                  "Explanation Gleason 5", "Explanations for Gleason 3",
                                                  "Explanation  for Gleason 5", "Explanations Gleason 3",
                                                  "Explanations Gleason 4", "Explanations Gleason 5", "Explanation Gleason 4",
                                                  "Explanation Gleason 3", "Explanations gleason 3", "Explanations gleason 4",
                                                  "Explanations gleason 5", "Explanation gleason 5", "Explanations Glaason 4"]:
                                  if type(textual_explanation)==list:
                                      textual_explanation = valid_key + " " + textual_explanation[0]
                                  else:
                                      textual_explanation = valid_key + " " + textual_explanation
                              else:
                                  if type(textual_explanation)==list:
                                      textual_explanation = textual_explanation[0]

                          # more cleaning: strip placeholder texts (full or truncated) and a trailing "none"
                          if textual_explanation == "Bitte wählen Sie mindestens eine Erklärung aus.":
                              textual_explanation = "No textual explanation given"
                          if textual_explanation.endswith("Bitte wählen Sie mindestens eine Erklärung aus."):
                              textual_explanation = textual_explanation[:-len("Bitte wählen Sie mindestens eine Erklärung aus.")].strip()
                          if textual_explanation.endswith(" Bitte wählen Sie eine p"):
                              textual_explanation = textual_explanation[:-len(" Bitte wählen Sie eine p")].strip()
                          if textual_explanation.endswith(" Bitte wählen Sie eine passende Erklärung aus."):
                              textual_explanation = textual_explanation[:-len(" Bitte wählen Sie eine passende Erklärung aus.")].strip()
                          if textual_explanation.endswith(" none"):
                              textual_explanation = textual_explanation[:-len("none")].strip()
                          if textual_explanation == "":
                              textual_explanation = "No textual explanation given"
                          # presumably unreachable: `.endswith` above already requires a str
                          if textual_explanation == []:
                              textual_explanation = "No textual explanation given"

                          # if there is both an explanation and a comment, put them together.
                          # NOTE(review): this only happens when the comment child was
                          # iterated before this explanation child of the same polygon.
                          if comment_exists:
                              textual_explanation = textual_explanation + ".\n Free text: " + comment

                          # add the polygon and explanation data to the dataframe
                          row = [annotator, TMA_name, grade, polygon_coords, textual_explanation, annotator_group]
                          explanations_df.loc[len(explanations_df)] = row

                  # handle the case of no textual explanation chosen:
                  # emit a single fallback row carrying the comment, if any
                  if not text_exists:
                      if comment_exists:
                          textual_explanation = "Free text: " + comment
                      else:
                          textual_explanation = "No textual explanation given"
                      row = [annotator, TMA_name, grade, polygon_coords, textual_explanation, annotator_group]
                      explanations_df.loc[len(explanations_df)] = row

                  # set the comment flag back so a comment never leaks into the next polygon
                  comment_exists = False
  return explanations_df, comments_df

## Export Train, Test & Validation Files

In [6]:
# Remapping tables for the selectable explanation labels.
with (img_source / "label_remapping.json").open("r") as mapping_file:
    label_mapping = json.load(mapping_file)

# Lookup table applied to free-text comments.
with (img_source / "free_text_mapping.json").open("r") as mapping_file:
    free_text_mapping = json.load(mapping_file)

In [7]:
# Gleason grade of every final explanation label, listed in rank order.
label_grade = {
    "variable sized well-formed individual and discrete glands": 3,
    "compressed or angular discrete glands": 3,
    "poorly formed and fused glands": 4,
    "Cribriform glands": 4,
    "Glomeruloid glands": 4,
    "solid groups of tumor cells": 5,
    "cords": 5,
    "single cells": 5,
    "presence of comedonecrosis": 5,
}

# Rank of a label = its position in this list (dicts keep insertion order).
label_ranks = list(label_grade)

In [8]:
def clean_explanation(remap_label, language):
    """Map a raw explanation string onto its final hierarchy label.

    The string is split at ``"Free text:"`` into the selected explanation
    and the free-text part, both normalised (line breaks removed, one pass
    of double-space replacement, surrounding blanks/dots stripped,
    lower-cased). The explanation is then corrected via the English error
    table when *language* is truthy, otherwise via the German error table
    followed by the German-to-English table. The free text is looked up in
    ``free_text_mapping``. A missing explanation falls back to the free
    text. Finally the value is resolved through the inverted
    ``label_mapping["hierarchy"]``.

    Returns the hierarchy label. If the explanation is unknown, two
    diagnostic lines are printed and the free text is looked up instead,
    which raises ``KeyError`` when that is unknown as well.
    """

    def split_explanations(input_string):
        # Missing input yields a pair of missing values.
        if pd.isna(input_string):
            return np.nan, np.nan

        strip_chars = " .\n\r"
        flattened = (
            input_string.replace("\n", "").replace("\r", "").replace("  ", " ")
        )

        if "Free text:" in flattened:
            selected, _, comment = flattened.partition('Free text:')
            selected = selected.strip(strip_chars)
            comment = comment.strip(strip_chars)
            if selected == "":
                selected = np.nan
        else:
            selected = flattened.strip(strip_chars)
            comment = np.nan

        if pd.notna(selected):
            selected = selected.strip(strip_chars).lower()
        if pd.notna(comment):
            comment = comment.strip(strip_chars).lower()

        return selected, comment

    expl, freetext = split_explanations(remap_label)

    # Read all tables up front so a missing key fails before any remapping.
    german_fixes = label_mapping["german_errors"]
    english_fixes = label_mapping["english_errors"]
    translations = label_mapping["translated"]
    hierarchy_by_grade = label_mapping["hierarchy"]

    if language:
        expl = english_fixes[expl] if expl in english_fixes else expl
    else:
        expl = german_fixes[expl] if expl in german_fixes else expl
        expl = translations[expl] if expl in translations else expl

    if freetext in free_text_mapping:
        freetext = free_text_mapping[freetext]

    if str(expl) == 'nan':
        expl = freetext

    # Get rid of Gleason levels: merge the per-grade tables into one.
    merged_hierarchy = {}
    for per_grade in hierarchy_by_grade.values():
        for final_label, raw_labels in per_grade.items():
            merged_hierarchy[final_label] = raw_labels

    # Invert it: every raw label points at its final label.
    final_label_of = {
        raw_label: final_label
        for final_label, raw_labels in merged_hierarchy.items()
        for raw_label in raw_labels
    }

    if expl in final_label_of:
        return final_label_of[expl]

    print("expl not in mapping:", expl)
    print("maybe freetext:", freetext)
    return final_label_of[freetext]
    


In [9]:
def get_sorted(explanations: typing.List[str], english: bool):
    """Return *explanations* ordered by the rank of their cleaned label.

    Every raw explanation is resolved with ``clean_explanation`` and ranked
    by the position of the result in ``label_ranks``. The returned list
    holds the original (uncleaned) strings; ties keep their input order.
    """
    ranks = [
        label_ranks.index(clean_explanation(raw, english))
        for raw in explanations
    ]
    ranked = sorted(zip(explanations, ranks), key=lambda pair: pair[1])
    return [raw for raw, _ in ranked]
        

In [10]:
def get_double_as_df(df: pd.DataFrame, doubles_df: pd.DataFrame):
    """Order multi-select polygons and propagate them to preceding imputed rows.

    Rows of *doubles_df* sharing the same ``coords`` are treated as one
    multi-selected polygon. For each polygon the explanations are ordered
    with ``get_sorted`` and written back to *df*. Rows directly before the
    polygon that are flagged ``imputed`` and belong to the same TMA get the
    first explanation; for every further explanation a copy of that row is
    created with a fractional index so that it sorts right behind it.

    Args:
        df: full explanations frame; MODIFIED IN PLACE. Positions and labels
            are used interchangeably (``iloc`` and ``at``), so the frame is
            assumed to carry a default 0..n-1 index.
        doubles_df: the rows of *df* that have duplicated ``coords``.

    Returns:
        A list of ``pd.Series`` rows to append to *df*; the caller is
        expected to concatenate them and sort by index.
    """
    extention_df = []

    # coords may be unhashable objects; group on their string form instead
    doubles_df = doubles_df.astype("string")
    indices = [group.index for _, group in doubles_df.groupby(["coords"])]

    annotator = df["annotator"].unique()[0]
    # an 'E' name part marks English labels, with one hard-coded exception
    english = 'E' in annotator.split('_') and annotator != "Karl_Karlson_E_3_5"

    for poly in indices:  # for each multiselected polygon
        explanations = [df.iloc[idx]['explanations'] for idx in poly]
        sorted_explanations = get_sorted(explanations, english)

        for num, idx in enumerate(poly):
            df.at[idx, 'explanations'] = sorted_explanations[num]

        current_TMA = df.iloc[poly[-1]]['TMA']

        # Walk back over the imputed rows directly before the polygon while
        # staying within the TMA. The walk stops at position 0: a negative
        # position would make iloc wrap around to the END of the frame and
        # `df.at` would then create new rows with negative labels.
        last_id = poly[0]
        first_id = last_id
        while first_id > 0:
            previous = df.iloc[first_id - 1]
            if not (previous['imputed'] == True  # noqa: E712 - keep exact comparison
                    and previous['TMA'] == current_TMA):
                break
            first_id -= 1

        if first_id != last_id:
            num_explanations = len(explanations)
            for i in range(first_id, last_id):  # for each row to impute
                current_row = df.iloc[i]
                df.at[i, 'explanations'] = sorted_explanations[0]  # set the zeroth explanation

                # generate rows for the other explanations
                for offset, expl_to_add in enumerate(sorted_explanations[1:], start=1):
                    new_row = current_row.copy()
                    new_row['explanations'] = expl_to_add
                    # fractional index: sorts after the original row and
                    # before the next integer index
                    new_row.name = int(new_row.name) + offset / num_explanations
                    extention_df.append(new_row)
    return extention_df


In [None]:
# Annotator folders that are excluded from the export.
drop_annotator = [
    'first_half_Victoria_Victoriason_E_19_1',
    'Ann_Annson_3_4',
    'Joe_Joeson_E_3_2',
]

def get_dfs(folder_name, file_names):
    """Load and clean every JSON export of one annotator folder.

    For each file the annotations are flattened with
    ``get_annotations_and_coords``; rows without a textual explanation are
    flagged in a new ``imputed`` column and back-filled per TMA, and
    polygons with duplicated ``coords`` are expanded via
    ``get_double_as_df``.

    Args:
        folder_name: ``Path`` of the annotator folder; its name is the
            annotator id.
        file_names: names of the JSON files inside the folder, in the order
            the caller wants the frames back.

    Returns:
        ``(frames, dupes)``: one DataFrame per entry of *file_names* and the
        number of rows with duplicated coords. Annotators listed in
        ``drop_annotator`` yield empty frames and a count of 0.
    """
    annotator = folder_name.name
    if annotator in drop_annotator:
        print("dropped:", annotator)
        # one empty placeholder per requested file so callers can unpack
        # the result exactly like the regular return value
        return tuple(pd.DataFrame([]) for _ in file_names), 0

    dupes = 0
    collected_explanations = []

    for fname in file_names:
        # JSON is UTF-8 encoded; do not depend on the platform's default
        # encoding, the labels contain non-ASCII characters
        with open(folder_name / fname, 'r', encoding='utf-8') as data_file:
            json_data = json.load(data_file)

        explanations_df, _ = get_annotations_and_coords({annotator: json_data}, annotator)

        # every row whose coords occur more than once (first occurrence included)
        duplicates = explanations_df["coords"].duplicated(keep=False)
        dupes += duplicates.sum()

        explanations_df = explanations_df.replace("No textual explanation given", np.nan)
        # remember which rows had no explanation before they are back-filled
        explanations_df['imputed'] = explanations_df['explanations'].isna()
        explanations_df["explanations"] = explanations_df.groupby('TMA', group_keys=False)['explanations'].apply(group_bfill)

        if duplicates.any():
            print("before", len(explanations_df))
            cat_dfs = get_double_as_df(explanations_df, explanations_df[duplicates])

            extention_df = pd.DataFrame(cat_dfs)
            explanations_df = pd.concat([explanations_df, extention_df], ignore_index=False)

            # the added rows carry fractional indices; sorting slots them in
            # right behind the row they were copied from
            explanations_df = explanations_df.sort_index().reset_index(drop=True)
            print("after", len(explanations_df))

        collected_explanations.append(explanations_df)

    print("total dupes", dupes)
    print("---")
    return collected_explanations, dupes



def get_test_and_val_files(data_source):
    """Collect the test, validation and train frames of all annotators.

    Every sub-directory of *data_source* whose name starts with neither
    ``.`` nor ``tabular`` is passed to ``get_dfs``; the per-annotator
    frames are concatenated per split.

    Returns:
        ``(test_df, validation_df, train_df, total_dupes)``.
    """
    file_names = ["test.json", "validation.json", "train.json"]
    test_dfs, validation_dfs, train_dfs = {}, {}, {}
    total_dupes = 0

    for folder in data_source.iterdir():
        if not folder.is_dir():
            continue
        if folder.name.startswith('.') or folder.name.startswith('tabular'):
            continue

        folder_name = str(folder.name)
        frames, dupes = get_dfs(folder, file_names)
        test_part, validation_part, train_part = frames
        test_dfs[folder_name] = test_part
        validation_dfs[folder_name] = validation_part
        train_dfs[folder_name] = train_part
        total_dupes += dupes

    test_df = pd.concat(test_dfs, ignore_index=True)
    validation_df = pd.concat(validation_dfs, ignore_index=True)
    train_df = pd.concat(train_dfs, ignore_index=True)
    return test_df, validation_df, train_df, total_dupes

# Build the three splits and report the number of duplicated-coords rows.
test_df, val_df, train_df, dupes = get_test_and_val_files(data_source)

print('----')
print("dupes", dupes)

# Write each split next to the raw annotation data.
for csv_name, split_df in (('test_df.csv', test_df),
                           ('val_df.csv', val_df),
                           ('train_df.csv', train_df)):
    split_df.to_csv(data_source / csv_name)

# Per split: number of missing explanations, then number of rows.
for split_name, split_df in (("val", val_df), ("test", test_df), ("train", train_df)):
    explanation_column = split_df["explanations"]
    print(split_name)
    print(explanation_column.isna().sum())
    print(len(explanation_column))

# Combine all splits (train, val, test order) under a fresh 0..n-1 index.
all_data = pd.concat([train_df, val_df, test_df], ignore_index=True)
all_data.to_csv(img_source / 'explanations_df.csv')
print("total", len(all_data))

In [None]:
def dataframe_nan_count(base_path):
    """Print the size and the missing-explanation count of the combined CSV.

    Args:
        base_path: directory that contains ``explanations_df.csv``.

    Prints the number of rows first, then the number of rows whose
    ``explanations`` value is missing.
    """
    # read from the directory that was passed in instead of silently
    # ignoring the argument in favour of a global
    df = pd.read_csv(Path(base_path) / 'explanations_df.csv')
    explanation_column = df["explanations"]
    print(len(explanation_column))
    print(explanation_column.isna().sum())


# the combined CSV is written to img_source, so that is the directory to check
dataframe_nan_count(img_source)

28529
422
422

In [13]:
# Reload the combined export from disk. The CSV was written with its index,
# which comes back as an extra unnamed column because no index_col is given.
all_data = pd.read_csv(filepath_or_buffer=img_source / 'explanations_df.csv')

In [14]:
def shortener(x: str) -> str:
    """Reduce a TMA name to its identifier.

    A name with exactly two ``_``-separated parts keeps both, with
    everything from the first ``.`` of the second part cut off. Any other
    name loses its last ``_``-separated part (a name without ``_`` becomes
    the empty string).
    """
    pieces = x.split('_')
    if len(pieces) != 2:
        return '_'.join(pieces[:-1])

    head, tail = pieces
    return head + '_' + tail.partition('.')[0]

In [15]:
# Derive the identifier of every row's TMA from its full name.
all_data['TMA_identifier'] = all_data['TMA'].map(shortener)

In [None]:
# Number of distinct annotators per TMA, and how often each number occurs.
print(len(all_data))
unique_count = all_data[['TMA_identifier', 'annotator']].groupby('TMA_identifier').nunique()
print(unique_count)
print('--')
print(unique_count['annotator'].value_counts())

In [None]:
# TMAs that are excluded from the data set.
drop_images = ['PR1001_D7', 'PR1921c_B15', 'PR1921c_B16']

# Report the number of distinct TMAs before and after removing them.
print(f"len: {len(all_data.groupby(['TMA_identifier']))}")
all_data = all_data.loc[~all_data['TMA_identifier'].isin(drop_images)]
print(f"lena: {len(all_data.groupby(['TMA_identifier']))}")

In [None]:
# TMAs that were NOT annotated by exactly three distinct annotators.
drop_three_annotator_TMAs = [
    name
    for name, series in all_data.groupby('TMA_identifier')
    if len(series['annotator'].unique()) != 3
]
print(drop_three_annotator_TMAs)
print("len", len(drop_three_annotator_TMAs))

In [None]:

# Rows and distinct TMAs before and after removing the collected TMAs.
print(len(all_data), "in", len(all_data.groupby(['TMA_identifier'])))
all_data = all_data.loc[~all_data['TMA_identifier'].isin(drop_three_annotator_TMAs)]
print(len(all_data), "in", len(all_data.groupby(['TMA_identifier'])))

# Number of different annotator counts that remain across the TMAs.
unique_count = all_data[['TMA_identifier', 'annotator']].groupby('TMA_identifier').nunique()
print('--')
print(unique_count['annotator'].nunique())

In [None]:
# The helper column is no longer needed; store and show the reduced data set.
all_data = all_data.drop('TMA_identifier', axis=1)
all_data.to_csv(img_source / "shortened_df.csv")
print(all_data)