# Processing dataframe
> Mainly defines events and 5-year and 10-year survival.

In [1]:
#| default_exp nb_01_processing

In [2]:
#| hide
from nbdev.showdoc import *

In [373]:
#| export
from ml.nb_00_preprocessing import pd

In [111]:
# Load the pickled source dataframe and display its (rows, columns) shape.
# NOTE(review): unpickling can execute arbitrary code - only read pickle
# files that come from a trusted source.
df0 = pd.read_pickle("data/df_all.pkl")
df0.shape

(1180, 69)

In [147]:
#| export
import re

In [374]:
#| export
def most_recent_date(df):
    """Return the latest date of every row; `df` must contain date columns only.

    A row whose latest date equals 1800-01-01 is reported as `NaT`.
    """
    treated_as_missing = pd.Timestamp("1800-01-01")
    latest = df.max(axis=1)
    return latest.mask(latest == treated_as_missing, pd.NaT)
def oldest_date(df):
    """Return the earliest date of every row; `df` must contain date columns only.

    A row whose earliest date equals 1800-01-01 is reported as `NaT`.
    """
    treated_as_missing = pd.Timestamp("1800-01-01")
    earliest = df.min(axis=1)
    # Keep every value that differs from the marker date, blank out the rest.
    return earliest.where(earliest != treated_as_missing, pd.NaT)

In [976]:
#| export
class ProcessDataframe:
    """Clean column names, parse date columns and derive survival times and events.

    Every method modifies `self.df`, a copy of the dataframe passed to the
    constructor, and returns nothing.
    """
    def __init__(self, df):
        self.df = df.copy()  # work on a copy so the caller's dataframe stays untouched

    def clean_cols(self):
        """Remove `/`, `,`, `.`, spaces and `-` from column names and replace `%` by `percent`"""
        # `-` is placed last inside the class so it is a literal dash, not a range.
        self.df.columns = self.df.columns.str.replace(r'[/,. -]', '', regex=True)
        self.df.columns = self.df.columns.str.replace(r'%', 'percent', regex=True)

    def to_date(self, cols):  # list of text columns to convert
        """Convert the text columns `cols` to datetimes.

        The first `dd.mm.yyyy` date found in a cell is kept; cells without
        such a date become `NaT`.
        """
        # The dots are escaped so that only the separator expected by the
        # `%d.%m.%Y` format below is extracted; an unescaped `.` would also
        # pick up e.g. `12-03-2020` and make `pd.to_datetime` raise.
        extracted = self.df[cols].apply(
            lambda col: col.str.extract(r'(\d{2}\.\d{2}\.\d{4})', expand=False))
        self.df[cols] = extracted.apply(lambda col: pd.to_datetime(col, format="%d.%m.%Y"))

    def _years_since_start(self, dates):
        "Years (of 365 days) between the `date_of_receit` column and `dates`"
        return (dates - self.df["date_of_receit"]).dt.days / 365

    def add_stimes(self,
                   date_of_receit,  # column with the start date all times are measured from
                   last_contacts:list, # columns with updated followups
                   recurrences:list, # columns with updated recurrences
                   deaths:list): # columns with updated deaths
        """Defines survival times based on recurrence or death.

        Adds the columns `date_of_receit`, `date_of_last_contact` (latest
        follow-up), `date_of_recurrence` and `date_of_death` (earliest of the
        given columns), the times in years `stime_recurrence`,
        `stime_metastasis` (a copy of `stime_recurrence`), `stime_death` and
        `stime`, and the 0/1 indicator `status_death`.
        """
        self.df["date_of_receit"] = self.df[date_of_receit]
        self.df["date_of_last_contact"] = most_recent_date(self.df[last_contacts])
        self.df["date_of_recurrence"] = oldest_date(self.df[recurrences])
        self.df["date_of_death"] = oldest_date(self.df[deaths])

        last_contact = self.df["date_of_last_contact"]

        # Rows without a recurrence date are timed up to the last contact instead.
        self.df["stime_recurrence"] = self._years_since_start(
            self.df["date_of_recurrence"].fillna(last_contact))

        self.df["stime_metastasis"] = self.df["stime_recurrence"]

        # Rows without a death date are timed up to the last contact instead.
        self.df["stime_death"] = self._years_since_start(
            self.df["date_of_death"].fillna(last_contact))

        # Overall time is the follow-up time, extended to the death time
        # wherever the death time is the larger of the two.
        followup = self._years_since_start(last_contact)
        self.df["stime"] = followup.mask(self.df["stime_death"] > followup,
                                         self.df["stime_death"])

        # 1 where a death date is known, 0 otherwise.
        self.df["status_death"] = self.df["date_of_death"].notna().astype("int64")

    def _add_year_survival(self,
                       event_name:str, #new survival event column name like event_5year
                       time_name:str, #new survival time column name like stime_5year
                       event:str, # actual column name with status info
                       time:str, # actual column name with survival time
                       s:int): # threshold for survival in years
        """Add a copy of `event`/`time` that is cut off at `s` years.

        Rows whose time is at least `s` get event 0 and time `s`; all other
        rows keep their original values. The source columns are not changed.
        """
        reached_threshold = self.df[time] >= s  # computed once, used for both columns

        self.df[event_name] = self.df[event]
        self.df.loc[reached_threshold, event_name] = 0

        self.df[time_name] = self.df[time]
        self.df.loc[reached_threshold, time_name] = s

    def add_events(self, col_names:list, cols:list):
        """Store each entry of `cols` in `self.df` under the matching name in `col_names`"""
        for name, values in zip(col_names, cols):
            self.df[name] = values

    def add_year_survival(self, *pars): # parameters in `_add_year_survival` in same order
        """Call `_add_year_survival` once per argument.

        Each argument is a sequence `[event_name, time_name, event, time, s]`.
        """
        for p in pars:
            self._add_year_survival(*p)

In [977]:
# Wrap a copy of the raw dataframe and normalise its column names.
PD = ProcessDataframe(df0)
PD.clean_cols()

In [978]:
# Columns (names as they are after `clean_cols`) that are parsed as dates.
date_cols = ['EDatum', 'Tod_Datum', 'letzteEE', 'EE2', 'EE3', 'Rezidivdatum_2011', 
        'Rezidivdatum2011_2', 'Rezidivdatum_2015', 'Rezidivdatum_2020']

In [979]:
# Convert the listed text columns to datetimes in place.
PD.to_date(date_cols)

In [980]:
# Column groups handed to `PD.add_stimes`: start date, follow-up dates,
# recurrence dates and death dates.
date_of_receit = ["EDatum"]
last_contacts = ['letzteEE', 'EE2', 'EE3']
recurrences = ['Rezidivdatum_2011', 'Rezidivdatum2011_2', 'Rezidivdatum_2015', 'Rezidivdatum_2020']
deaths = ["Tod_Datum"]

In [981]:
# Derive the date, survival-time and `status_death` columns.
PD.add_stimes(date_of_receit=date_of_receit,
              last_contacts=last_contacts,
              recurrences=recurrences,
              deaths=deaths)

In [982]:
# Register the event indicators: recurrence and metastasis both reuse
# `Rezidiv_jn`, death uses the derived `status_death`.
PD.add_events(["event_recurrence", "event_metastasis", "event_death"], 
              [PD.df["Rezidiv_jn"], PD.df["Rezidiv_jn"],  PD.df["status_death"]] )

In [983]:
# Each list is [new event column, new time column, source event column,
# source time column, threshold in years].
PD.add_year_survival(["event_5y_recurrence", "stime_5y_recurrence", "event_recurrence", "stime_recurrence", 5],
                    ["event_10y_recurrence", "stime_10y_recurrence", "event_recurrence", "stime_recurrence", 10],
                    ["event_5y_death", "stime_5y_death", "event_death", "stime_death", 5],
                    ["event_10y_death", "stime_10y_death", "event_death", "stime_death", 10],
                    ["event_5y_metastasis", "stime_5y_metastasis", "event_metastasis", "stime_metastasis", 5],
                    ["event_10y_metastasis", "stime_10y_metastasis", "event_metastasis", "stime_metastasis", 10])

In [984]:
#PD.df.columns

In [985]:
# Display the last ten columns of the processed dataframe.
PD.df[PD.df.columns[-10:]]

Unnamed: 0,event_10y_recurrence,stime_10y_recurrence,event_5y_death,stime_5y_death,event_10y_death,stime_10y_death,event_5y_metastasis,stime_5y_metastasis,event_10y_metastasis,stime_10y_metastasis
0,0,10.000000,0,5.000000,0,10.000000,0,5.000000,0,10.000000
1,0,10.000000,0,5.000000,0,10.000000,0,5.000000,0,10.000000
2,0,10.000000,0,5.000000,0,10.000000,0,5.000000,0,10.000000
3,0,9.304110,0,5.000000,0,9.304110,0,5.000000,0,9.304110
4,0,7.019178,0,5.000000,0,7.019178,0,5.000000,0,7.019178
...,...,...,...,...,...,...,...,...,...,...
1175,0,,0,,0,,0,,0,
1176,0,4.186301,0,4.186301,0,4.186301,0,4.186301,0,4.186301
1177,0,10.000000,0,5.000000,0,10.000000,0,5.000000,0,10.000000
1178,0,4.427397,0,4.427397,0,4.427397,0,4.427397,0,4.427397


In [404]:
#| export
import numpy as np

In [410]:
def add_year_survival(times = "stime_recurrence", events = "Rezidiv_jn"):
    """Print every survival-time column name next to its event column name.

    `times` and `events` may each be a single column name or a sequence of
    column names; entries are paired by position.
    """
    # A bare string is one column name. Without this, zip() would walk the
    # two strings character by character.
    if isinstance(times, str):
        times = [times]
    if isinstance(events, str):
        events = [events]

    for t, e in zip(times, events):
        print(t, e)

# df[col_time] = df[col_time]/np.timedelta64(1, 'Y')

# df.loc[df[col_time] >= 10 ,col_status] = 0
# df.loc[df[col_time] >= 10 ,col_time] = 10

In [567]:
def _add_year_survival(self,
                       event_name:str, #new survival event column name like event_5year
                       time_name:str, #new survival time column name like stime_5year
                       time:str, # actual column name with survival time
                       event:str, # actual column name with status info
                       s:int): # threshold for survival in years
    """Add event/time columns cut off at `s` years to the dataframe `self`.

    `self` is the dataframe itself (this is a plain function, not a method).
    A timedelta `time` column is converted to years in place first; rows
    whose time is at least `s` then get event 0 and time `s` in the new
    columns.
    """
    frame = self  # operate on the dataframe that was passed in, not on a global

    # Convert only while the column still holds timedeltas, so a second call
    # on the same column does not divide again.
    if getattr(frame[time].dtype, "kind", None) == "m":
        # 31556952 s = 365.2425 days. NOTE(review): chosen to reproduce the
        # former `np.timedelta64(1, 'Y')` divisor - confirm this year length
        # is the intended one.
        frame[time] = frame[time] / np.timedelta64(31556952, 's')

    reached_threshold = frame[time] >= s

    frame[event_name] = frame[event]
    frame.loc[reached_threshold, event_name] = 0

    frame[time_name] = frame[time]
    frame.loc[reached_threshold, time_name] = s

In [564]:
# NOTE(review): the `_add_year_survival` function defined in the previous cell
# has five more required parameters (event_name, time_name, time, event, s),
# so this call raises TypeError as written.
_add_year_survival(PD.df)

In [565]:
# Display the 5-year event/time columns.
# NOTE(review): no visible cell creates `event_5year` / `stime_5year`; the
# output below presumably stems from an earlier version of the helper - confirm.
PD.df[["event_5year", "stime_5year"]]

Unnamed: 0,event_5year,stime_5year
0,0,5 years
1,0,5 years
2,0,5 years
3,0,5 years
4,0,5 years
...,...,...
1175,0,
1176,0,4.183522
1177,0,5 years
1178,0,4.424458


In [None]:
#stime_recurrence_5years
#stime_recurrence_10years
#event_recurrence_5years
#event_recurrence_10years

#stime_death_5years
#stime_death_10years
#event_death_5years
#event_death_10years

#stime_recurrence_or_death_5years
#stime_recurrence_or_death_10years
#event_recurrence_or_death_5years
#event_recurrence_or_death_10years

In [None]:
# Full pipeline in one cell: copy the raw dataframe, clean the column names,
# parse the date columns and derive the survival-time columns.
PD = ProcessDataframe(df0)
PD.clean_cols()
# Columns (names as they are after `clean_cols`) that are parsed as dates.
date_cols = ['EDatum', 'Tod_Datum', 'letzteEE', 'EE2', 'EE3', 'Rezidivdatum_2011', 
                'Rezidivdatum2011_2', 'Rezidivdatum_2015', 'Rezidivdatum_2020']
PD.to_date(date_cols)

# Column groups for `add_stimes`: start date, follow-up dates, recurrence
# dates and death dates.
date_of_receit = ["EDatum"]
last_contacts = ['letzteEE', 'EE2', 'EE3']
recurrences = ['Rezidivdatum_2011', 'Rezidivdatum2011_2', 'Rezidivdatum_2015', 'Rezidivdatum_2020']
deaths = ["Tod_Datum"]

PD.add_stimes(date_of_receit=date_of_receit,
              last_contacts=last_contacts,
              recurrences=recurrences,
              deaths=deaths)