## Data Processes: Second Assignment (first part)

### Grupo:
Víctor Morcuende Castell, 47315589N

Guillermo Nájera Lavid, 70845359T

Javier Rocamora García, 20081979N

Antonio Ruiz García, 06601574E

In [47]:
import pandas as pd
from sklearn import preprocessing
from collections import defaultdict
from sklearn.impute import SimpleImputer
import math
import numpy as np

In [48]:
# Auxiliary functions

def diagnoseDate_to_ageDiagnosed(birthDate, diagnoseDate):
    return pd.to_datetime(diagnoseDate).year-pd.to_datetime(birthDate).year

def deathDate_to_survivalTime(diagnosisDate, deathDate):
    survivalTime = pd.to_datetime(deathDate).year-pd.to_datetime(diagnosisDate).year
    if math.isnan(survivalTime) or survivalTime < 0:
        survivalTime = 1000
    return survivalTime

def deathDate_to_survived(deathDate):
    survived = pd.to_datetime(deathDate).year
    if math.isnan(survived):
        survived = 1
    else: 
        survived = 0
    return survived

def deathDate_to_age(birthDate, deathDate):
    deathAge = pd.to_datetime(deathDate).year-pd.to_datetime(birthDate).year
    if math.isnan(deathAge):
        deathAge = 0
    return deathAge

def recurrence_year_to_recurrence_time(diagnosisDate, recurrence_year):
    recurrenceTime = recurrence_year-pd.to_datetime(diagnosisDate).year
    if math.isnan(recurrenceTime) or recurrenceTime < 0:
        recurrenceTime = 1000
    return recurrenceTime

def fix_pregnancy(pregnancies, abortions, births):
    if pregnancies < (births + abortions):
        pregnancies = pregnancies+1
    return pregnancies

def preprocess_t(x):
    new_t = x.t
    if pd.isnull(new_t):
        if not(pd.isnull(x.t_after_neoadj)):
            new_t = x.t_after_neoadj
        else:
            new_t = "unknown"
    if not(isinstance(new_t, str)):
        int(new_t)
    return str(new_t)

def preprocess_n(x):
    new_n = x.n
    if pd.isnull(new_n):
        if not(pd.isnull(x.n_after_neoadj)):
            new_n = x.n_after_neoadj
        else:
            new_n = "unknown"
    if not(isinstance(new_n, str)):
        int(new_n)
    return str(new_n)

def preprocess_m(x):
    new_m = x.m
    if pd.isnull(new_m):
        if not(pd.isnull(x.m_after_neoadj)):
            new_m = x.m_after_neoadj
        else:
            new_m = "unknown"
    if not(isinstance(new_m, str)):
        int(new_m)
    return str(new_m)


def fill_t_after_neoadj(x):
    new_t = x.t_after_neoadj
    if pd.isnull(new_t):
            new_t = x.t
    if not(isinstance(new_t, str)):
        int(new_t)
    return str(new_t)

def fill_n_after_neoadj(x):
    new_n = x.n_after_neoadj
    if pd.isnull(new_n):
            new_n = x.n
    if not(isinstance(new_n, str)):
        int(new_n)
    return str(new_n)

def fill_m_after_neoadj(x):
    new_m = x.m_after_neoadj
    if pd.isnull(new_m):
            new_m = x.m
    if not(isinstance(new_m, str)):
        int(new_m)
    return str(new_m)

### Preprocessing of Breast Cancer datasets

In [49]:
# Deleting duplicated data and unused column
df1 = pd.read_excel("breast_cancer_data.xlsx")
df1 = df1.drop_duplicates(subset=['ehr'], keep='first')
df1 = df1.set_index('ehr')
df2 = pd.read_excel("breast_cancer_data_2.xlsx")
df2 = df2.drop_duplicates(subset=['ehr'], keep='first')
df2 = df2.set_index('ehr')
df = pd.concat([df1, df2], axis=0)
df.pop('Unnamed: 0')
    
# Duplicating the DataFrame in order to obtain the numerical variables
df_num = pd.DataFrame(data=df, columns=df.columns, index=df.index)
df_num.pop('side')
df_num.pop('neoadjuvant')
df_num.pop('grade')
df_num.pop('invasive')
df_num.pop('er_positive')
df_num.pop('pr_positive')
df_num.pop('her2_positive')
df_num.pop('hist_type')

# Dividing the DataFrame into categorical and numerical variables
num_cols = df_num.columns.tolist()
df_cat = df.drop(num_cols, axis=1)

We delete the NULL values of the categorical variables by using the Simple Imputer

In [50]:
df_cat.side = df_cat.side.apply(lambda x: 'unknown' if (x != 'left' and x != 'right') else x)
df_cat.invasive = df_cat.invasive.apply(lambda x: 0 if x != 1 else x)

# Imputation of nulls in categorical columns using Simple Imputer
imp_cat = SimpleImputer(strategy='most_frequent')
columns = df_cat.columns
index = df_cat.index
df_cat = pd.DataFrame(imp_cat.fit_transform(df_cat), columns=columns, index=index)

# Transforming categorical values into numerical variables
df_cat.neoadjuvant = df_cat.neoadjuvant.apply(lambda x: 0.0 if x == 'no' else 1.0)

Now we are about to subsitute the categorical labels that are represented by strings with numerical values, in order to avoid working with Strings.

In [51]:
# Taking some variables out as they are already converted into numerical values
df_aux = pd.DataFrame(data=df_cat, columns=df_cat.columns, index=df_cat.index)
df_cat.pop('neoadjuvant')
df_cat.pop('invasive')
df_cat.pop('er_positive')
df_cat.pop('pr_positive')
df_cat.pop('her2_positive')
num_cols = df_cat.columns.tolist()
df_aux = df_aux.drop(num_cols, axis=1)

# Using OneHotEncoder
ohe = preprocessing.OneHotEncoder(sparse=False)
df_cat_ohe = pd.DataFrame(ohe.fit_transform(df_cat), 
                          columns=ohe.get_feature_names_out(df_cat.columns.tolist()),
                          index=df_cat.index)

# Merge both DataFrames (df_cat_ohe and df_aux)
df_cat_def = pd.merge(left=df_cat_ohe, right=df_aux, on='ehr')

We will swap birth_date, diagnosis_date and death_date with the age at which the patient was diagnosed, their age at the time of death if so, and how long they survived in this case. This will be of way more use to us when trying to use this data for any predictions

In [52]:
# Age at which the patient was diagnosed
ageDiagnosed = pd.Series(df_num.apply(lambda x: diagnoseDate_to_ageDiagnosed(x.birth_date, x.diagnosis_date), axis=1), name='age_diagnosed')

# Time of survival since diagnosis, 1000 in case of full recovery
survivalTime = pd.Series(df_num.apply(lambda x: deathDate_to_survivalTime(x.diagnosis_date, x.death_date), axis=1), name='survival_time')

# We set "survived" column to be the target variable
class_col = pd.Series(df_num.apply(lambda x: deathDate_to_survived(x.death_date), axis=1), name='survived')

# Recurrence time for a patient
recurrenceTime = pd.Series(df_num.apply(lambda x: recurrence_year_to_recurrence_time(x.diagnosis_date, x.recurrence_year), axis=1), name='recurrence_time')

# Changing variables
df_num.pop('birth_date')
df_num.pop('diagnosis_date')
df_num.pop('death_date')
df_num.pop('recurrence_year')
df_num = pd.merge(left=df_num, right=ageDiagnosed, on='ehr')
df_num = pd.merge(left=df_num, right=survivalTime, on='ehr')
df_num = pd.merge(left=df_num, right=recurrenceTime, on='ehr')

We will treat differently the pregnancy, abort, birth and caesarean labels, which should not be filled with data computed from the mean of the rest of the data

In [53]:
df_num.pregnancy = df_num.pregnancy.apply(lambda x: 0 if math.isnan(x) else x)
df_num.abort = df_num.abort.apply(lambda x: 0 if math.isnan(x) else x)
df_num.birth = df_num.birth.apply(lambda x: 0 if x < 0 else x)
df_num.caesarean = df_num.caesarean.apply(lambda x: 0 if math.isnan(x) else x)

Since we saw some cases in which number of pregnancies, number of aborts and number of births dont compute, meaning that there may have been less pregnancies than supposed due to contradictory data, we decided to increment the number of pregnancies for those cases that do not add up.

In [54]:
df_num.pregnancy = df_num.apply(lambda x: fix_pregnancy(x.pregnancy, x.abort, x.birth), axis=1)

We delete the NULL values of the numerical variables by using the Simple Imputer

In [55]:
# Imputation of nulls in numerical columns using Simple Imputer
imp_num = SimpleImputer(strategy='mean')
columns = df_num.columns
index = df_num.index
df_num_def = pd.DataFrame(imp_num.fit_transform(df_num), columns=columns, index=index)

# We round up the menarche_age and menopause_age columns to give it sense
df_num_def.menarche_age = df_num_def.menarche_age.apply(np.ceil)
df_num_def.menopause_age = df_num_def.menopause_age.apply(np.ceil)

Now that all variables are numerical and do not have missing values, we can merge the categorical and numerical variables

In [56]:
df_preprocessed = pd.merge(left=df_cat_def, right=df_num_def, on='ehr')
df_preprocessed.to_csv("output1.csv")

### Preprocessing of TNM datasets

We start by eliminating duplicated data, since we do not think that having information about these few cases with multi tumoral cancers will give us any edge in the training process

In [57]:
# Deleting duplicated data
df3 = pd.read_csv("breast_cancer_data_tnm.csv")
df3 = df3.drop_duplicates(subset=['ehr'], keep='last')
df3 = df3.set_index('ehr')
df4 = pd.read_csv("breast_cancer_data_tnm_2.csv")
df4 = df4.drop_duplicates(subset=['ehr'], keep='last')
df4 = df4.set_index('ehr')
df_tnm = pd.concat([df3, df4], axis=0)

t_after_neoadjuvant, n_after_adjuvant and m_after_neoadjuvant columns will be completed with data from the t, n and m columns for those cases in which there have not been any neoadjuvant treatments.

In [58]:
# Preprocess of t, n and m column, changing IS and X labels and filling with t_after_neoadj in case there is no data
df_tnm.t = df_tnm.apply(lambda x: preprocess_t(x), axis=1)
df_tnm.n = df_tnm.apply(lambda x: preprocess_n(x), axis=1)
df_tnm.m = df_tnm.apply(lambda x: preprocess_m(x), axis=1)
df_tnm.astype('int16').dtypes

# Fill t_after_neoadj, n_after_neoadj and m_after_neoadj column with t column in case there is no data
df_tnm.t_after_neoadj = df_tnm.apply(lambda x: fill_t_after_neoadj(x), axis=1)
df_tnm.n_after_neoadj = df_tnm.apply(lambda x: fill_n_after_neoadj(x), axis=1)
df_tnm.m_after_neoadj = df_tnm.apply(lambda x: fill_m_after_neoadj(x), axis=1)

IntCastingNaNError: Cannot convert non-finite values (NA or inf) to integer

In [None]:
# Imputation of nulls using Simple Imputer
imp_tnm = SimpleImputer(strategy='most_frequent')
columns = df_tnm.columns
index = df_tnm.index
df_tnm_def = pd.DataFrame(imp_tnm.fit_transform(df_tnm), columns=columns, index=index)
df_tnm_def.to_csv("output2.csv")

We convert the columns from tnm dataset into categorical ones after preprocessing them

In [None]:
# Using OneHotEncoder
df_tnm_def_ohe = pd.DataFrame(ohe.fit_transform(df_tnm_def), 
                          columns=ohe.get_feature_names_out(df_tnm_def.columns.tolist()),
                          index=df_tnm_def.index)

In [None]:
#df_preprocessed_def = df_preprocessed.join(other=df_tnm_def)
df_preprocessed_def = pd.merge(left=df_preprocessed, right=df_tnm_def_ohe, how='outer', on='ehr')
df_preprocessed_def.to_excel("output_def.xlsx")