# Processing dataframe
> Mainly defines the event indicators and the 5-year and 10-year survival columns.

In [1]:
#| default_exp nb_01_processing

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
from ml.nb_00_preprocessing import pd

In [4]:
#| export
import numpy as np

In [5]:
# Load the raw input frame from its pickle file and show its (rows, columns) shape.
df0 = pd.read_pickle("data/df_all.pkl")
df0.shape

(1180, 69)

In [6]:
#| export
import re

In [7]:
#| export
def most_recent_date(df):
    """Return, per row, the latest date found across the columns of `df`.

    `df` must be a pandas DataFrame made of date columns only. A row whose
    latest date equals the placeholder 1800-01-01 is reported as NaT.
    """
    placeholder = pd.Timestamp("1800-01-01")
    latest = df.max(axis=1)
    is_placeholder = latest == placeholder
    latest[is_placeholder] = pd.NaT
    return latest
def oldest_date(df):
    """Return, per row, the earliest date found across the columns of `df`.

    `df` must be a pandas DataFrame made of date columns only. A row whose
    earliest date equals the placeholder 1800-01-01 is reported as NaT.
    """
    placeholder = pd.Timestamp("1800-01-01")
    earliest = df.min(axis=1)
    is_placeholder = earliest == placeholder
    earliest[is_placeholder] = pd.NaT
    return earliest

In [8]:
#| export
class ProcessDataframe:
    """Clean column names, parse date columns and derive survival times and event indicators.

    All work happens on `self.df`, a copy of the frame given to the constructor,
    so the caller's frame is left untouched.
    """
    def __init__(self,df): 
        self.df = df.copy()
        # Columns holding dates as text; `process_data_in_one_step` parses them with `to_date`.
        self.date_cols = ['EDatum', 'Tod_Datum', 'letzteEE', 'EE2', 'EE3', 'Rezidivdatum_2011', 
                          'Rezidivdatum2011_2', 'Rezidivdatum_2015', 'Rezidivdatum_2020']
        
        # Column groups handed to `add_stimes` by `process_data_in_one_step`.
        self.date_of_receit = ["EDatum"]
        self.last_contacts = ['letzteEE', 'EE2', 'EE3']
        self.recurrences = ['Rezidivdatum_2011', 'Rezidivdatum2011_2', 'Rezidivdatum_2015', 'Rezidivdatum_2020']
        self.deaths = ["Tod_Datum"]
    
    def clean_cols(self):
        """Strip `/`, `,`, `.`, spaces and `-` from column names and spell `%` as `percent`."""
        # The pattern below is two adjacent string literals that Python concatenates
        # into the single character class `[/,., ,-]`.
        self.df.columns = self.df.columns.str.replace(r'[/,., '  ',-]', '',regex=True)
        self.df.columns = self.df.columns.str.replace(r'%', 'percent',regex=True)
    
    def to_date(self, cols): 
        """Convert the text columns `cols` to datetimes, reading the first `dd.mm.yyyy` in each cell.

        Cells without such a date become NaT.
        """
        # The dots are escaped on purpose: an unescaped `.` matches any character, so
        # text such as a 10-digit number would be extracted and then make
        # `pd.to_datetime` fail against the "%d.%m.%Y" format.
        date_text = self.df[cols].apply(lambda col: col.str.extract(r'(\d{2}\.\d{2}\.\d{4})', expand=False))
        self.df[cols] = date_text.apply(lambda x: pd.to_datetime(x, format="%d.%m.%Y"))

    def add_stimes(self, 
                   date_of_receit, 
                   last_contacts:list, # columns with updated followups    
                   recurrences:list, # columns with updated recurrences
                   deaths:list): # columns with updated deaths
        """Defines survival times (in years of 365 days) based on recurrence or death.

        Adds the columns `date_of_receit`, `date_of_last_contact`, `date_of_recurrence`,
        `date_of_death`, `stime_recurrence`, `stime_metastasis`, `stime_death`, `stime`
        and `status_death` to `self.df`.
        """
    
        self.df["date_of_receit"] = self.df[date_of_receit]
        self.df["date_of_last_contact"] = most_recent_date(self.df[last_contacts])
        # NOTE(review): this takes the latest recorded recurrence date, not the first - confirm.
        self.df["date_of_recurrence"] = most_recent_date(self.df[recurrences])
        self.df["date_of_death"] = oldest_date(self.df[deaths])
        
        # Rows without a recurrence date fall back to the date of last contact.
        no_recurrence = pd.isnull(self.df["date_of_recurrence"])
        self.df.loc[no_recurrence, "date_of_recurrence"] = self.df.loc[no_recurrence, "date_of_last_contact"] 
        # NOTE(review): np.abs turns a date lying before the date of receipt into a
        # positive duration instead of exposing it - confirm this is intended.
        self.df["stime_recurrence"] = np.abs((self.df["date_of_recurrence"] - self.df["date_of_receit"]).dt.days/365)
        
        # Metastasis has no date columns of its own here; it reuses the recurrence time.
        self.df["stime_metastasis"] = self.df["stime_recurrence"]
    
        # `stime_death` temporarily holds a date (death, else last contact) and is
        # then replaced by the duration since the date of receipt.
        self.df["stime_death"] = self.df["date_of_death"]
        no_death = pd.isnull(self.df["stime_death"])
        self.df.loc[no_death, "stime_death"] = self.df.loc[no_death, "date_of_last_contact"] 
        self.df["stime_death"] = np.abs((self.df["stime_death"] - self.df["date_of_receit"]).dt.days/365)
        
        # Overall follow-up time: time to last contact, extended to the death time when that is longer.
        self.df["stime"] = (self.df["date_of_last_contact"] - self.df["date_of_receit"]).dt.days/365
        death_is_later = self.df["stime_death"]>self.df["stime"]
        self.df.loc[death_is_later,"stime"] = self.df.loc[death_is_later,"stime_death"]
        
        # 1 when a death date is known, 0 otherwise.
        self.df["status_death"] = 0
        self.df.loc[~pd.isnull(self.df["date_of_death"]), "status_death"] = 1
      
    def _add_year_survival(self, 
                       event_name:str, #new survival event column name like event_5year 
                       time_name:str, #new survival time column name like stime_5year 
                       event:str, # actual column name with status info
                       time:str, # actual column name with survival time
                       s:int): # threshold for survival in years
        """Add copies of `event` and `time` that are truncated at `s` years.

        Rows whose time is `s` or more get event 0 and time `s`.
        """
        beyond_threshold = self.df[time]>=s

        self.df[event_name] = self.df[event]
        self.df.loc[beyond_threshold, event_name] = 0
    
        self.df[time_name] = self.df[time]
        self.df.loc[beyond_threshold, time_name] = s
        
    def add_cols(self, col_names:list, cols:list):
        """Assign each item of `cols` to `self.df` under the matching name in `col_names`."""
        for n,o in zip(col_names, cols):
            self.df[n] = o
            
    def add_year_survival(self, *pars): # parameters in `_add_year_survival` in same order
        """Call `_add_year_survival` once per item of `pars`, using its first five entries."""
        for p in pars:
            self._add_year_survival(*p[:5])
            
    def process_data_in_one_step(self):
        """Run the whole pipeline: clean names, parse dates, add survival times and events."""
        
        self.clean_cols()
        self.to_date(self.date_cols)
        self.add_stimes(date_of_receit=self.date_of_receit,
                        last_contacts=self.last_contacts,
                        recurrences=self.recurrences,
                        deaths=self.deaths)
        self.add_cols(["event_recurrence", "event_metastasis", "event_death"], 
                        [self.df["Rezidiv_jn"], self.df["Metastase_jn"],  self.df["status_death"]] )
        self.add_cols(["event_dom"], # dom: death or metastasis
                        [(self.df.event_death.astype("int")) | (self.df.event_metastasis.astype("int"))]
                       )
        
        # Time for the combined event. Metastasis is applied last, so it takes
        # precedence over the death time when both events are present.
        self.df["stime_dom"] = self.df["stime_metastasis"]  
        died = self.df["event_death"]==1
        self.df.loc[died ,"stime_dom"] = self.df.loc[died ,"stime_death"]
        metastasised = self.df["event_metastasis"]==1
        self.df.loc[metastasised ,"stime_dom"] = self.df.loc[metastasised ,"stime_metastasis"]
        
        self.add_year_survival(
                    ["event_5y_recurrence", "stime_5y_recurrence", "event_recurrence", "stime_recurrence", 5],
                    ["event_10y_recurrence", "stime_10y_recurrence", "event_recurrence", "stime_recurrence", 10],
                    ["event_5y_death", "stime_5y_death", "event_death", "stime_death", 5],
                    ["event_10y_death", "stime_10y_death", "event_death", "stime_death", 10],
                    ["event_5y_dom", "stime_5y_dom", "event_dom", "stime_dom", 5],
                    ["event_10y_dom", "stime_10y_dom", "event_dom", "stime_dom", 10]
        )
        print("Process done!")

In [9]:
# Wrap the raw frame; ProcessDataframe works on its own copy, so df0 stays unchanged.
PD = ProcessDataframe(df0)

In [10]:
# Run the full pipeline: clean names, parse dates, add survival times and event columns.
PD.process_data_in_one_step()

Process done!


In [11]:
# Column names of the processed frame.
PD.df.columns

Index(['TMA_ID', 'TMASpot', 'fn_x', 'fn_y', 'Relevanz', 'AlternativeBelegung',
       'PatID', 'EDatum', 'Alter', 'G', 'T', 'grTumordurchmesser',
       'weitereTumordurchmesser', 'N', 'ER', 'PR', 'Her2neu', 'Her2neuScore',
       'Ki67', 'Ki67_percent', 'DegreeOfTubuleFormation',
       'NuclearPleomorphism', 'Mitosis', 'VanNuys', 'WHO', 'WHO_DCIS',
       'Tumorart', 'Probenart', 'Seite', 'Status', 'letzteEE', 'EE2', 'EE3',
       'Tod_Quelle', 'Tod_Datum', 'Rezidiv_jn', 'Metastase_jn',
       'Rezidivdatum_2011', 'Rezidivdatum2011_2', 'Rezidivdatum_2015',
       'Rezidivdatum_2020', 'Metastase_Lokalisation', 'AB_Massnahme',
       'AB_Radiatio', 'Radiatio_jn', 'AB_zytotox_Medikament',
       'AB_zytotox_Zyklen', 'AB_zytotox_Abbruch', 'Chemo_jn',
       'Chemo_Zeitpunkt', 'Chemo_Medikament', 'AB_endokr_Dauer',
       'AB_endokr_Medikament', 'AB_endokr_Abbruch', 'Antihormon_jn',
       'Tamoxifen_2020', 'Antihormon_2015', 'Tamoxifen_2015', 'Tamoxifen_2014',
       'Tamoxifen_Exprimage

In [12]:
# Shorthand for the processed frame (the same object as PD.df, not a copy).
df = PD.df

In [13]:
# Tabulate the value counts of every event indicator column side by side.
# `pd.Series.value_counts` is used because the top-level `pd.value_counts` is deprecated.
df[[c for c in df.columns if "event" in c]].apply(pd.Series.value_counts)

Unnamed: 0,event_recurrence,event_metastasis,event_death,event_dom,event_5y_recurrence,event_10y_recurrence,event_5y_death,event_10y_death,event_5y_dom,event_10y_dom
0.0,1014,1108,1072,1014,1117,1075,1138,1105,1112,1067
1.0,166,72,108,166,63,105,42,75,68,113


In [14]:
# Count the rows in which the recurrence and the death-or-metastasis indicators sum to 2.
indicator_sum = df.event_recurrence + df.event_dom
(indicator_sum == 2).sum()

79

In [15]:
# Share of rows with event_dom == 1.
n_dom_events = df.event_dom.value_counts()[1]
n_dom_events / df.shape[0]

0.14067796610169492

In [16]:
def flatten(l):
    """Join the items of every sub-iterable of `l` into one flat list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [17]:
# Date and status columns to inspect. 'EDatum' is named explicitly rather than
# looked up by position, so the selection does not depend on column order.
l = ['EDatum', 'letzteEE', 'EE2', 'EE3',
     'Tod_Datum', 'Rezidiv_jn', 'Metastase_jn',
     'Rezidivdatum_2011', 'Rezidivdatum2011_2', 'Rezidivdatum_2015',
     'Rezidivdatum_2020']

In [18]:
# Show the selected columns for rows whose recurrence date precedes the date of receipt.
recurrence_before_receit = df.date_of_recurrence < df.date_of_receit
df[l][recurrence_before_receit]

Unnamed: 0,EDatum,letzteEE,EE2,EE3,Tod_Datum,Rezidiv_jn,Metastase_jn,Rezidivdatum_2011,Rezidivdatum2011_2,Rezidivdatum_2015,Rezidivdatum_2020
280,2002-12-12,2015-12-07,2008-10-31,NaT,NaT,1,1.0,NaT,NaT,2002-11-01,NaT
326,2006-08-24,2020-10-05,2016-01-28,2009-02-16,NaT,1,0.0,NaT,NaT,2006-08-01,NaT
345,2001-12-06,2020-09-16,2015-12-18,NaT,NaT,1,0.0,NaT,NaT,2001-03-01,NaT
351,2008-08-05,2020-09-16,2015-12-11,2011-06-03,NaT,1,0.0,1992-06-01,NaT,2008-06-01,NaT
379,2004-05-21,2020-09-14,2015-12-08,2008-09-23,NaT,1,0.0,NaT,NaT,NaT,2004-05-01
424,2009-06-26,2015-12-13,2011-07-04,NaT,NaT,1,0.0,2003-06-01,NaT,2009-06-01,NaT
458,2010-06-03,2020-09-25,2015-12-16,2011-06-27,NaT,1,0.0,2010-04-15,NaT,2010-04-15,NaT
482,2011-04-08,2015-12-15,2011-06-24,NaT,NaT,1,1.0,2011-03-15,NaT,2011-03-01,NaT
489,2011-02-07,2020-11-06,2015-12-08,NaT,NaT,1,0.0,NaT,NaT,NaT,2011-02-01
570,2011-03-18,2015-11-25,2008-10-14,NaT,2019-08-15,1,1.0,NaT,NaT,2011-02-01,NaT


## Save data

In [19]:
#| hide
# Persist the processed frame as a pickle; the file name carries the stamp 2022_10_06.
df.to_pickle("data/df_all_2022_10_06.pkl")

In [20]:
#| hide
# Run nbdev's export step for this notebook (cells marked `#| export`).
import nbdev; nbdev.nbdev_export()