# Processing dataframe
> Mainly defines the event and the 5-year and 10-year survival.

In [1]:
#| default_exp nb_01_processing

In [2]:
#| hide
from nbdev.showdoc import *

In [4]:
from ml.nb_00_preprocessing import pd

In [111]:
# Load the pickled dataframe; the path is relative to the current working directory.
# NOTE(review): unpickling can execute arbitrary code, so only load files from a trusted source.
df0 = pd.read_pickle("data/df_all.pkl")
df0.shape

(1180, 69)

In [147]:
#| export
import re

In [357]:
#| export
def most_recent_date(df):
    """Row-wise latest date of `df`, which must be a Pandas dataframe with date columns only.

    A row whose latest date equals the placeholder 1800-01-01 is returned as `NaT`."""
    placeholder = pd.Timestamp("1800-01-01")
    latest = df.max(axis=1)
    is_placeholder = latest.eq(placeholder)
    latest.loc[is_placeholder] = pd.NaT
    return latest

In [363]:
#| export
class ProcessDataframe:
    "Clean column names, parse date columns and derive follow-up times on a private copy of `df`"
    def __init__(self,df): 
        # Work on a copy so the caller's dataframe is never mutated.
        self.df = df.copy()
    
    def clean_cols(self):
        "Drop `/`, `,`, `.`, spaces and `-` from column names and spell `%` out as `percent`"
        # One plain character class; `-` sits last so it is a literal, not a range.
        self.df.columns = self.df.columns.str.replace(r'[/,. -]', '', regex=True)
        self.df.columns = self.df.columns.str.replace('%', 'percent', regex=False)
    
    def to_date(self, cols): 
        "Replace each column in `cols` by the first `dd.mm.yyyy` date found in its text; cells without one become `NaT`"
        def _parse(col):
            # Already converted (e.g. the cell was run twice): keep as is instead of failing on `.str`.
            if pd.api.types.is_datetime64_any_dtype(col): return col
            # The dots are escaped on purpose: a bare `.` matches any character, so a value such as
            # `21/11/2004` would be extracted and then make `to_datetime` raise on the fixed format.
            found = col.str.extract(r'(\d{2}\.\d{2}\.\d{4})', expand=False)
            return pd.to_datetime(found, format="%d.%m.%Y")
        self.df[cols] = self.df[cols].apply(_parse)

    def add_stimes(self, 
                   date_of_receit, # column name (or one-element list) with the start date
                   last_contacts:list, # columns with updated followups    
                   recurrences:list, # columns with updated recurrences
                   deaths:list): # columns with updated deaths
        "Add consolidated `date_of_*` columns and the durations `stime_recurrence`, `stime_death` and `stime`"
        df = self.df  # same object as `self.df`, so every assignment below lands on it
        df["date_of_receit"] = df[date_of_receit]
        df["date_of_last_contact"] = most_recent_date(df[last_contacts])
        # NOTE(review): this keeps the latest recurrence date; confirm that the first one is not wanted.
        df["date_of_recurrence"] = most_recent_date(df[recurrences])
        df["date_of_death"] = most_recent_date(df[deaths])

        # Time to event; rows without the event fall back to the last contact date.
        for event in ("recurrence", "death"):
            end = df[f"date_of_{event}"].fillna(df["date_of_last_contact"])
            df[f"stime_{event}"] = end - df["date_of_receit"]

        # Overall follow-up ends at the later of last contact and death. Taking the row-wise
        # maximum ignores missing values, so a known death date still yields a duration when
        # no contact date exists (a `>` comparison against NaT is always False and would drop it).
        last_seen = most_recent_date(df[["date_of_last_contact", "date_of_death"]])
        df["stime"] = last_seen - df["date_of_receit"]

In [364]:
# Wrap a copy of the raw frame, then normalise the column names of that copy.
PD = ProcessDataframe(df0)
PD.clean_cols()

In [365]:
# Columns that `to_date` parses into datetimes, one role per line:
# start date, death, follow-up contacts, recurrences.
date_cols = [
    'EDatum',
    'Tod_Datum',
    'letzteEE', 'EE2', 'EE3',
    'Rezidivdatum_2011', 'Rezidivdatum2011_2', 'Rezidivdatum_2015', 'Rezidivdatum_2020',
]

In [366]:
# Parse the text dates in `date_cols` into datetimes; this overwrites those columns of `PD.df`.
PD.to_date(date_cols)

In [367]:
# Inspect the parsed dates. Entries equal to 1800-01-01 are placeholders that
# `most_recent_date` later turns into NaT.
PD.df[date_cols]

Unnamed: 0,EDatum,Tod_Datum,letzteEE,EE2,EE3,Rezidivdatum_2011,Rezidivdatum2011_2,Rezidivdatum_2015,Rezidivdatum_2020
0,2004-11-21,NaT,2020-09-16,2015-12-07,2008-11-03,NaT,NaT,NaT,NaT
1,2003-08-05,NaT,2015-12-29,2008-11-03,NaT,NaT,NaT,NaT,NaT
2,1998-02-25,2019-12-15,2009-02-03,NaT,NaT,NaT,NaT,NaT,NaT
3,1999-07-15,NaT,2008-10-31,NaT,NaT,NaT,NaT,NaT,NaT
4,2001-10-11,NaT,2008-10-16,NaT,NaT,NaT,NaT,NaT,NaT
...,...,...,...,...,...,...,...,...,...
1175,2004-07-01,1800-01-01,NaT,NaT,NaT,NaT,NaT,NaT,NaT
1176,2004-08-11,NaT,2008-10-17,NaT,NaT,NaT,NaT,NaT,NaT
1177,2004-09-30,NaT,2015-12-09,2008-10-31,NaT,NaT,NaT,NaT,NaT
1178,2004-07-01,NaT,2008-12-03,NaT,NaT,NaT,NaT,NaT,NaT


In [368]:
# Column groups handed to `add_stimes`; each variable is named after the keyword it is passed as.
date_of_receit = ["EDatum"]
deaths = ["Tod_Datum"]
last_contacts = [
    'letzteEE',
    'EE2',
    'EE3',
]
recurrences = [
    'Rezidivdatum_2011',
    'Rezidivdatum2011_2',
    'Rezidivdatum_2015',
    'Rezidivdatum_2020',
]

In [370]:
# Add the consolidated `date_of_*` columns and the `stime*` durations to `PD.df`.
PD.add_stimes(date_of_receit=date_of_receit,
              last_contacts=last_contacts,
              recurrences=recurrences,
              deaths=deaths)

In [371]:
# Show only the derived survival-time columns. The column names are taken from `PD.df` itself,
# so the cell does not depend on a leftover variable from an earlier kernel session.
PD.df[[c for c in PD.df.columns if "stime" in c]]

Unnamed: 0,stime_recurrence,stime_death,stime
0,5778 days,5778 days,5778 days
1,4529 days,4529 days,4529 days
2,3996 days,7963 days,7963 days
3,3396 days,3396 days,3396 days
4,2562 days,2562 days,2562 days
...,...,...,...
1175,NaT,NaT,NaT
1176,1528 days,1528 days,1528 days
1177,4087 days,4087 days,4087 days
1178,1616 days,1616 days,1616 days


In [372]:
# List every column of the processed frame, including those added by `add_stimes`.
PD.df.columns

Index(['TMA_ID', 'TMASpot', 'fn_x', 'fn_y', 'Relevanz', 'AlternativeBelegung',
       'PatID', 'EDatum', 'Alter', 'G', 'T', 'grTumordurchmesser',
       'weitereTumordurchmesser', 'N', 'ER', 'PR', 'Her2neu', 'Her2neuScore',
       'Ki67', 'Ki67_percent', 'DegreeOfTubuleFormation',
       'NuclearPleomorphism', 'Mitosis', 'VanNuys', 'WHO', 'WHO_DCIS',
       'Tumorart', 'Probenart', 'Seite', 'Status', 'letzteEE', 'EE2', 'EE3',
       'Tod_Quelle', 'Tod_Datum', 'Rezidiv_jn', 'Metastase_jn',
       'Rezidivdatum_2011', 'Rezidivdatum2011_2', 'Rezidivdatum_2015',
       'Rezidivdatum_2020', 'Metastase_Lokalisation', 'AB_Massnahme',
       'AB_Radiatio', 'Radiatio_jn', 'AB_zytotox_Medikament',
       'AB_zytotox_Zyklen', 'AB_zytotox_Abbruch', 'Chemo_jn',
       'Chemo_Zeitpunkt', 'Chemo_Medikament', 'AB_endokr_Dauer',
       'AB_endokr_Medikament', 'AB_endokr_Abbruch', 'Antihormon_jn',
       'Tamoxifen_2020', 'Antihormon_2015', 'Tamoxifen_2015', 'Tamoxifen_2014',
       'Tamoxifen_Exprimage