## Get the names of the hospitals from the original `.mat` files

In [None]:
from pathlib import Path
import os
import re
import typing
from typing import List
import pandas as pd
from pandas import DataFrame
import hashlib
import shutil
from random import shuffle
import numpy as np
from tqdm import tqdm

In [None]:
# Root folder of the raw dataset; the cells below walk it recursively with
# `os.walk` and read its "gbm" and "met" sub-folders.
path = Path("/storage_1/003_raw_gbm_met_classifier/")

In [None]:
# Split file names into two groups (new and old naming conventions) by the
# 4th character of the name (index 3): '_' -> new convention, '-' -> old one.
# File names of the two conventions are parsed differently further below.
new_conv = set()
old_conv = set()
for root, dirs, files in os.walk(path):
    for name in files:
        # Slice instead of indexing: names shorter than 4 characters yield ''
        # and are skipped rather than raising IndexError.
        marker = name[3:4]
        if marker == '_':
            new_conv.add(name)
        elif marker == '-':
            old_conv.add(name)

# helper for dumping one of the groups to stdout
def print_group(group):
    """Print every element of `group` on a line of its own."""
    print("".join(str(member) + "\n" for member in group), end="")

# print("New convention:")
# print_group(new_conv)

# print("Old convention:")
# print_group(old_conv)

In [None]:
# Collect the raw hospital labels (no case normalisation) found in the file names.
hospitals = set()

# processing `new_conv`: the label is the leading run of ASCII letters
# of the third '_'-separated field
for fname in new_conv:
    tmp = fname.split("_")[2]
    hospital = re.search(r"[a-zA-Z]*", tmp).group()
    hospitals.add(hospital)

# processing `old_conv`: the label is the first two characters
# of the second '-'-separated field
for fname in old_conv:
    hospital = fname.split("-")[1][:2]
    hospitals.add(hospital)

print(hospitals)

In [None]:
def get_hospital_from_filename(fpath:typing.Union[str, Path]) -> typing.Optional[str]:
    """
    Extracts the upper-cased hospital label from the name of a `.mat` file.

    The file name must follow one of two conventions, told apart by the 4th
    character (index 3) of the file stem:

    * ``'_'`` (new convention): the label is the leading run of ASCII letters
      of the third ``'_'``-separated field (an empty string if that field does
      not start with a letter);
    * ``'-'`` (old convention): the label is the first two characters of the
      second ``'-'``-separated field.

    Args:
        fpath: file name or full path (`str` or `Path`).

    Returns:
        The label in upper case, or `None` for files whose extension is not
        `.mat`.

    Raises:
        ValueError: if the stem follows neither convention (this includes
            stems shorter than 4 characters and new-convention stems without
            a third field).
    """
    fpath = Path(fpath)
    # we process only `.mat` files
    if fpath.suffix != ".mat":
        return None
    fname = fpath.stem
    # Slicing (not indexing) keeps too-short stems on the ValueError path below.
    marker = fname[3:4]
    if marker == '_':
        fields = fname.split("_")
        if len(fields) > 2:
            return re.search(r"[a-zA-Z]*", fields[2]).group().upper()
    elif marker == '-':
        # A '-' at index 3 guarantees at least two '-'-separated fields.
        return fname.split("-")[1][:2].upper()
    raise ValueError("Unsupported filename format: {}".format(fpath))

In [None]:
# Show which hospital label is resolved for every file below `path`.
for root, dirs, files in os.walk(path):
    for name in files:
        located_at = os.path.join(root, name)
        hospital_label = get_hospital_from_filename(name)
        print(located_at, '->', hospital_label)

## Replace the label 'CREAL' with 'CR' in the names of the `.mat` files

In [None]:
old_label = r"CREAL"
new_label = "CR"
for root, dirs, files in os.walk(path):
    for name in files:
        full_path = os.path.join(root, name)
        if get_hospital_from_filename(full_path) != old_label:
            continue
        # Only new-convention ('_'-separated) names can carry a label this long
        # (old-convention labels are two characters); it sits in the third field.
        parts = name.split("_")
        # The label was matched after upper-casing, so substitute
        # case-insensitively; `count=1` touches only the leading label.
        parts[2] = re.sub(old_label, new_label, parts[2], count=1, flags=re.IGNORECASE)
        target = os.path.join(root, "_".join(parts))
        # `os.rename` may silently replace an existing file; refuse instead.
        if os.path.exists(target):
            raise FileExistsError("Refusing to overwrite existing file: {}".format(target))
        os.rename(full_path, target)

## Check if we have duplicates
Iterate over all files and count the occurrences of each filename in the folder tree. Print the duplicates.

In [None]:
# filename -> number of times it occurs anywhere below `path`
d_count = {}
for root, dirs, files in os.walk(path):
    for fname in files:
        d_count[fname] = d_count.get(fname, 0) + 1

# report names occurring more than once and collect them into a set
duplicates = set()
for key, occurrences in d_count.items():
    if occurrences <= 1:
        continue
    print(key, ":", occurrences)
    duplicates.add(key)

In [None]:
# filename -> list of every full path at which that filename was found
names = {}
for root, dirs, files in os.walk(path):
    for fname in files:
        names.setdefault(fname, []).append(os.path.join(root, fname))

In [None]:
# For every duplicated filename list all the locations it was found at.
separator = "-" * 100
for fname in duplicates:
    print(fname)
    for location in names[fname]:
        print(location)
    print(separator)

Check whether the duplicated files are identical, or whether they are different files that share the same name (by comparing their MD5 hashes).

In [None]:
def file_as_bytes(file):
    """Return the whole content of the open `file` object; the object is closed on exit."""
    with file:
        content = file.read()
    return content

# Print the location and the MD5 digest of every file whose name is duplicated.
for root, dirs, files in os.walk(path):
    for fname in files:
        if fname not in duplicates:
            continue
        full_path = os.path.join(root, fname)
        print(full_path)
        digest = hashlib.md5(file_as_bytes(open(full_path, 'rb'))).hexdigest()
        print(fname, digest)

So, files are different - different masks. We will keep all of them.

## Rename duplicates

In [None]:
# Count every file below the "gbm" folder (no extension filter is applied).
gbm_counter = sum(len(files) for root, dirs, files in os.walk(path / "gbm"))
print("Number of 'gbm' '.mat' files: {}.".format(gbm_counter))

In [None]:
# Count every file below the "met" folder (no extension filter is applied).
met_counter = sum(len(files) for root, dirs, files in os.walk(path / "met"))

# Discount the duplicated names.
# NOTE(review): the reason for the extra `+ 1` is not evident from the code - confirm.
met_counter = met_counter - (len(duplicates) + 1)
print("Number of 'met' '.mat' files: {}.".format(met_counter))

In [None]:
# Ratio of the 'met' file count to the 'gbm' file count.
met_to_gbm_ratio = met_counter / gbm_counter
print("{}".format(met_to_gbm_ratio))

Assume we decided to keep all copies of the data. Then we have to rename the duplicated filenames so that the `.png` images are not overwritten during generation. So, iterate over all files and keep a dictionary of filename counters; if the filename is already in the dictionary, modify the name of the current file by adding `"_copy_<value in dictionary>"` before the file extension, then increase that dictionary value by 1. After such a modification it is necessary to regenerate the `.png` files.

In [None]:
# filename -> how many files with that name have been seen so far
d_count = {}
for root, dirs, files in os.walk(path):
    for fname in files:
        seen = d_count.get(fname, 0)
        if seen:
            # Not the first file with this name: tag it with the running counter.
            stem, ext = os.path.splitext(fname)
            new_fname = stem + "_copy_{}".format(seen) + ext
            shutil.move(os.path.join(root, fname), os.path.join(root, new_fname))
        d_count[fname] = seen + 1

In [None]:
# Re-count the filenames below `path`.
d_count = {}
for root, dirs, files in os.walk(path):
    for fname in files:
        d_count[fname] = d_count.get(fname, 0) + 1

# report names occurring more than once and collect them into a set
duplicates = set()
for key, occurrences in d_count.items():
    if occurrences <= 1:
        continue
    print(key, ":", occurrences)
    duplicates.add(key)

In [None]:
# Display the set of filenames that still occur more than once.
duplicates

## Organize `train` / `valid` splitting of the data with respect to the hospital
First we need to create a Pandas DataFrame that contains the hospital and the tumor class for each of the files.

In [None]:
def _get_labels_from_imagenet_like_folder(root:Path) -> List:
    """
    Returns names of folders in the folder.
    """
    result = []
    for el in os.listdir(root):
        # if doesn't have extension - folder
        if len(el.split('.')) == 1:
            result.append(el)
    return result

In [None]:
# Display the class labels detected in the dataset root.
_get_labels_from_imagenet_like_folder(path)

In [None]:
def _create_df_from_folder(path:Path) -> DataFrame:
    """
    Builds a dataframe describing each `.mat` file found in the `path` tree.

    The index is the file name without its extension; the columns are
    `hospital`, `label` (name of the top-level sub-folder) and `path`.
    Files sharing the same name end up under one key, so only the last
    one visited is kept.
    """
    records = {}
    for label in _get_labels_from_imagenet_like_folder(path):
        for root, dirs, files in os.walk(path / label):
            for fname in files:
                # only `.mat` files are described
                if Path(fname).suffix != ".mat":
                    continue
                stem = os.path.splitext(fname)[0]
                location = os.path.join(root, fname)
                records[stem] = (get_hospital_from_filename(fname), label, location)
    return pd.DataFrame.from_dict(records, orient='index', columns=['hospital', 'label', 'path'])

In [None]:
# Build the per-file table for the whole dataset and preview its first rows.
df = _create_df_from_folder(path)
df.head()

In [None]:
# Persist the table as "split.csv" inside the dataset root.
df.to_csv(path / "split.csv")

In [None]:
# Read the CSV back, using its first column as the index, and preview it.
df_2 = pd.read_csv(path / "split.csv", index_col=0)
df_2.head()

We need to split the dataset into `train` (80 %) and `valid` (20 %), but in such a way that every hospital, and every tumor class within it, is split in the same proportion.

In [None]:
# Non-null entries per column for each hospital.
df.groupby('hospital').count()

In [None]:
# Non-null entries per column for each (hospital, label) pair.
df.groupby(['hospital', 'label']).count()

In [None]:
# Preview the last rows of the table.
df.tail()

In [None]:
# Distinct hospital labels present in the table.
df.hospital.unique()

In [None]:
def _split_df_to_train_valid(df:DataFrame, pct:float = 0.8) -> DataFrame:
    """
    Generates DataFrame with new `data_split` column with values from {"train", "valid"}.
    From each hospital for each category randomly assign `pct` labels of "train", the rest are "valid".
    """
    result = df.copy(deep=True)
    result["data_split"] = [None] * result.shape[0]
    for hospital in result.hospital.unique():
        # split by hospital
        tmp = result[result["hospital"] == hospital]
        for label in tmp.label.unique():
            # split by label in the hospital
            tmp_per_label = tmp[tmp['label'] == label]
            # index till which to split to `train`
            index = np.random.permutation(tmp_per_label.index)
            for idx_el in index[:int(pct * len(index))]:
                # modify the whole dataframe
                result.loc[[idx_el], ['data_split']] = 'train'
            for idx_el in index[int(pct * len(index)):]:
                result.loc[[idx_el], ['data_split']] = 'valid'
    return result

In [None]:
# Split with the default `pct` and preview the result; `df` itself is left untouched.
df_2 = _split_df_to_train_valid(df)
df_2.head()

In [None]:
# make data splitting in dataframe
pct = 0.8
result = df.copy(deep=True)
result["data_split"] = [None] * result.shape[0]
for hospital in result.hospital.unique():
    # split by hospital
    tmp = result[result["hospital"] == hospital]
    for label in tmp.label.unique():
        # split by label in the hospital
        tmp_per_label = tmp[tmp['label'] == label]
        # index till which to split to `train`
        index = np.random.permutation(tmp_per_label.index)
        for idx_el in index[:int(pct * len(index))]:
            # modify the whole dataframe
            result.loc[[idx_el], ['data_split']] = 'train'
        for idx_el in index[int(pct * len(index)):]:
            result.loc[[idx_el], ['data_split']] = 'valid'

In [None]:
# Display the frame holding the split computed in the previous cell.
result

In [None]:
# Display the source frame; the split was made on a copy, so `df` has no `data_split` column.
df

Sanity check:

In [None]:
# `df` never receives the `data_split` column (the split works on a copy),
# so the check has to look at the frame returned by `_split_df_to_train_valid`.
df_2[["path", "data_split"]]

In [None]:
# Scratch check of the shuffle-then-cut idea on a plain list of ten numbers.
tmp_lst = list(range(10))
shuffle(tmp_lst)
pivot_idx = int(len(tmp_lst) *  0.8)
tmp_lst_train, tmp_lst_valid = tmp_lst[:pivot_idx], tmp_lst[pivot_idx:]
for chunk in (tmp_lst, tmp_lst_train, tmp_lst_valid):
    print(chunk)

In [None]:
# df.index.values
# Scratch check: `np.random.permutation` returns a shuffled copy of `a_1`.
a_1 = np.arange(10)
a_2 = np.random.permutation(a_1)
a_2

In [None]:
# Display `a_1` for comparison with the permuted `a_2`.
a_1

In [None]:
# First three elements of `a_1`.
a_1[:3]

In [None]:
# (rows, columns) of the table.
df.shape

In [None]:
# Number of rows in the table.
len(df.index)

In [None]:
# `df` has no `data_split` column (the split works on a copy); inspect the
# frame returned by `_split_df_to_train_valid` instead.
df_2[["path", "data_split"]]

In [None]:
# Save the frame that carries the `data_split` column; `df` itself was never
# given one, so writing `df` would lose the train/valid assignment.
df_2.to_csv(path / "final.csv")

In [None]:
# Display the frame with the train/valid assignment.
df_2