# Preprocessing

> Preprocess data: load histology and mask images, as well as tabular clinical data, from directories to generate one data frame.

In [9]:
#| default_exp nb_00_preprocessing

In [10]:
#| hide
from nbdev.showdoc import *

In [11]:
#| export
from pathlib import Path
import os
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np

# Convenience patch: `some_path.ls()` returns the entries of a directory as a list.
Path.ls = lambda x: [*x.iterdir()]

## Images

In [4]:
# Root folder of the data set. Set the TMA_DATA_ROOT environment variable to point
# the notebook at another location; the default is the original external drive.
DATA_ROOT = Path(os.environ.get("TMA_DATA_ROOT", "/media/dimi/TOSHIBA EXT"))

path_x = DATA_ROOT/"Dimitrij Original Images"  # image folder
path_y = DATA_ROOT/"Dimitrij Shape Masks"      # mask folder

In [5]:
#| export
def _get_files(p, fs, extensions=None):
    p = Path(p)
    res = [p/f for f in fs if not f.startswith('.')
           and ((not extensions) or f'.{f.split(".")[-1].lower()}' in extensions)]
    return res

def get_files(path, extensions=None, recurse=False, include=None):
    """Collect the files under `path`, optionally filtered by extension.

    Source: fast.ai, Jeremy Howard

    path: directory to search (str or pathlib path)
    extensions: one extension or an iterable of extensions including the dot
        (e.g. [".tif"]), matched case-insensitively; `None` keeps all files
    recurse: if True, walk the whole tree below `path` instead of listing
        only `path` itself
    include: with `recurse=True`, the only direct sub-directories of `path`
        that are descended into; below them (and everywhere when `include`
        is None) directories whose name starts with "." are skipped

    Returns a list of `Path` objects; files whose name starts with "." are never returned.
    """
    path = Path(path)
    # A lone string is one extension, not an iterable of single characters.
    if isinstance(extensions, str): extensions = [extensions]
    # Guard the default: iterating over `None` would raise a TypeError.
    extensions = {e.lower() for e in extensions} if extensions else None
    if not recurse:
        names = [o.name for o in os.scandir(path) if o.is_file()]
        return _get_files(path, names, extensions)
    res = []
    for i,(p,d,f) in enumerate(os.walk(path)): # yields (dirpath, dirnames, filenames)
        # Assigning to d[:] prunes, in place, the sub-directories os.walk visits next.
        if include is not None and i==0: d[:] = [o for o in d if o in include]
        else:                            d[:] = [o for o in d if not o.startswith('.')]
        res += _get_files(p, f, extensions)
    return res

In [6]:
fn_x = get_files(path_x, recurse=True, extensions=[".tif"])
fn_y = get_files(path_y, recurse=True, extensions=[".tif"])
len(fn_x), len(fn_y)

(1316, 1229)

## Clinical data

In [7]:
path_c = Path("/media/dimi/TOSHIBA EXT/Dimitrij Single Cores/allTXT")

In [8]:
fn_c = get_files(path_c, recurse=False, extensions=[".txt"])

In [9]:
#| export 
import pandas as pd

## One clinical dataframe

In [10]:
#|export
def get_clinical_data(p):
    """Read tab-separated clinical tables and stack them into one data frame.

    p: iterable of paths (str or pathlib) to the text files to read, e.g. the
       list returned by `get_files`

    Each file is read as ISO-8859-1 encoded, tab-separated text. Columns whose
    name contains "Unnamed" are dropped, and a leading `TMA_ID` column holding
    the file name without its extension is added.

    Returns the row-wise concatenation of all tables; each table keeps its own
    row index. Raises ValueError if `p` contains no files.
    """
    fns = [Path(fn) for fn in p]  # also accept plain string paths
    if not fns:
        raise ValueError("get_clinical_data: no clinical data files given")
    dfs = []
    for fn in fns:
        df = pd.read_csv(fn, sep="\t", encoding="ISO-8859-1", engine="python")
        # pandas labels header cells that are empty (e.g. after a trailing tab) "Unnamed: <n>".
        df = df.drop(columns=df.filter(regex='Unnamed').columns)
        df.insert(0, 'TMA_ID', fn.stem)
        dfs.append(df)
    return pd.concat(dfs)

In [11]:
df_c = get_clinical_data(fn_c)

In [12]:
df_c.shape

(3133, 66)

## One images dataframe

In [13]:
#| export
def _get_tma_id(fn):
    """fn: pathlib path to file"""
    tma = fn.parent.name.split('_')[0]
    return tma[:-2] + tma[-2:].lstrip('0')

def get_tma_id(fns):
    """Return the TMA identifier of every path in `fns`, in the same order.

    fns: list of pathlib paths
    """
    return [_get_tma_id(fn) for fn in fns]

In [14]:
#| export
def _get_tma_spot(fn):
    """fn: pathlib path to file"""
    f = fn.stem[-1]+fn.stem[:-2]
    return f if len(f)==3 else f[0]+'0'+f[1] 

def get_tma_spot(fns):
    """Return the spot label of every path in `fns`, in the same order.

    fns: list of pathlib paths
    """
    return [_get_tma_spot(fn) for fn in fns]

In [15]:
# Image table: TMA block and spot parsed from each path, plus the path itself.
df_x = pd.DataFrame({'TMA_ID': get_tma_id(fn_x),
                     'TMA-Spot': get_tma_spot(fn_x),
                     'fn_x': fn_x})

# Mask table with the same keys. The path column is named fn_y so that a merge
# with df_x keeps both path columns under their own names rather than producing
# the suffixed duplicates fn_x_x / fn_x_y.
df_y = pd.DataFrame({'TMA_ID': get_tma_id(fn_y),
                     'TMA-Spot': get_tma_spot(fn_y),
                     'fn_y': fn_y})

In [16]:
df_x.shape

(1316, 3)

In [17]:
df_y.shape

(1229, 3)

In [18]:
# Inner join: keep only the spots present in both df_x and df_y.
df_xy = df_x.merge(df_y, how="inner", on=["TMA_ID", "TMA-Spot"])

In [19]:
df_xy.shape

(1229, 4)

In [20]:
# Inner join: keep only the spots that also have a row in the clinical table.
df_all = df_xy.merge(df_c, how="inner", on=["TMA_ID", "TMA-Spot"])

In [21]:
df_all.shape

(1180, 68)

## All together

In [26]:
# Root folder of the data set. Set the TMA_DATA_ROOT environment variable to point
# the notebook at another location; the default is the original external drive.
DATA_ROOT = Path(os.environ.get("TMA_DATA_ROOT", "/media/dimi/TOSHIBA EXT"))

path_x = DATA_ROOT/"Dimitrij Original Images"     # image folder
path_y = DATA_ROOT/"Dimitrij Shape Masks"         # mask folder
path_c = DATA_ROOT/"Dimitrij Single Cores"/"allTXT"  # clinical text files

In [27]:
fn_x = get_files(path_x, recurse=True, extensions=[".tif"])
fn_y = get_files(path_y, recurse=True, extensions=[".tif"])
fn_c = get_files(path_c, recurse=False, extensions=[".txt"])

In [28]:
# Image and mask tables: TMA block and spot parsed from each path, plus the path.
df_x = pd.DataFrame({
    'TMA_ID': get_tma_id(fn_x),
    'TMA-Spot': get_tma_spot(fn_x),
    'fn_x': fn_x,
})
df_y = pd.DataFrame({
    'TMA_ID': get_tma_id(fn_y),
    'TMA-Spot': get_tma_spot(fn_y),
    'fn_y': fn_y,
})

# Clinical table assembled from the text files.
df_c = get_clinical_data(fn_c)

# Inner joins on block and spot: a row survives only if it has an image,
# a mask and a clinical record.
df_xy = df_x.merge(df_y, how="inner", on=["TMA_ID", "TMA-Spot"])
df_all = df_xy.merge(df_c, how="inner", on=["TMA_ID", "TMA-Spot"])

In [29]:
np.unique(df_all.TMA_ID)

array(['TMA1', 'TMA2', 'TMA25', 'TMA26', 'TMA27', 'TMA28', 'TMA29',
       'TMA3', 'TMA5', 'TMA6', 'TMA7', 'TMA8'], dtype=object)

In [30]:
# Placeholder label for every row. Note this is the literal string 'NaN', not a
# missing value, so pandas' isna() will not flag rows that are left with it.
df_all["Type"] = 'NaN'

In [31]:
# Label each row with the group its TMA block belongs to.
df_all.loc[df_all.TMA_ID.isin(['TMA1', 'TMA2', 'TMA3']), "Type"] = "Greyzone"

df_all.loc[df_all.TMA_ID.isin(['TMA5', 'TMA6', 'TMA7', 'TMA8']), "Type"] = "Cohort"

df_all.loc[df_all.TMA_ID.isin(['TMA25', 'TMA26', 'TMA27', 'TMA28', 'TMA29']), "Type"] = "TAM-only"

In [32]:
df_all.Type.unique()

array(['Greyzone', 'Cohort', 'TAM-only'], dtype=object)

## Save data frame

In [34]:
# Path is relative to the working directory. Create the output folder first:
# to_pickle does not create missing parent directories.
out_path = Path("data/df_all.pkl")
out_path.parent.mkdir(parents=True, exist_ok=True)
df_all.to_pickle(out_path)

In [24]:
#| hide
import nbdev; nbdev.nbdev_export()