In [390]:
import pandas as pd
from sklearn import preprocessing
from collections import defaultdict
from sklearn.impute import SimpleImputer
import math

In [391]:
# Auxiliary functions

def diagnoseDate_to_ageDiagnosed(birthDate, diagnoseDate):
    """Return the age at diagnosis as a difference of calendar years.

    Both arguments are parsed with ``pd.to_datetime``; only the ``year``
    component of each date is used, so month and day are ignored.

    Parameters
    ----------
    birthDate : value accepted by ``pd.to_datetime``
        Date of birth.
    diagnoseDate : value accepted by ``pd.to_datetime``
        Date of diagnosis.

    Returns
    -------
    The diagnosis year minus the birth year.
    """
    diagnosis_year = pd.to_datetime(diagnoseDate).year
    birth_year = pd.to_datetime(birthDate).year
    return diagnosis_year - birth_year

def deathDate_to_age(birthDate, deathDate):
    """Return the age at death as a difference of calendar years, or 0.

    Both arguments are parsed with ``pd.to_datetime``; only the ``year``
    component of each date is used, so month and day are ignored.

    Parameters
    ----------
    birthDate : value accepted by ``pd.to_datetime``
        Date of birth.
    deathDate : value accepted by ``pd.to_datetime``
        Date of death.

    Returns
    -------
    The death year minus the birth year, or ``0`` when that difference is
    NaN (e.g. when a date is missing).
    """
    death_year = pd.to_datetime(deathDate).year
    birth_year = pd.to_datetime(birthDate).year
    years_lived = death_year - birth_year
    # 0 acts as the "no valid age" sentinel
    return 0 if math.isnan(years_lived) else years_lived

In [392]:
df = pd.read_excel("breast_cancer_data.xlsx", index_col='ehr')

# Deleting unused column
df = df.drop(columns=['Unnamed: 0'])

# Columns treated as categorical; every remaining column is treated as numerical
CATEGORICAL_COLS = [
    'side',
    'neoadjuvant',
    'grade',
    'invasive',
    'er_positive',
    'pr_positive',
    'her2_positive',
    'hist_type',
]

# Dividing the DataFrame into categorical and numerical variables.
# Explicit .copy() calls guarantee that the two halves are independent of `df`:
# pd.DataFrame(data=df, ...) does not copy the underlying data of a DataFrame.
df_num = df.drop(columns=CATEGORICAL_COLS).copy()
num_cols = df_num.columns.tolist()
df_cat = df.drop(columns=num_cols).copy()

# Inspecting the raw values of 'invasive' (it contains missing values)
for x in df_cat.invasive:
    print(x)

1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
nan
1.0
1.0
nan
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
nan
1.0
1.0
nan
1.0
1.0
1.0
1.0
1.0
1.0
nan
1.0
1.0
1.0
1.0
1.0
nan
nan
1.0
nan
nan
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
nan
1.0
1.0
1.0
1.0
nan
1.0
1.0
1.0
1.0
1.0
nan
1.0
1.0
1.0
1.0
nan
1.0
1.0
1.0
1.0
nan
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
nan
1.0
1.0
1.0
1.0
nan
1.0
1.0
nan
1.0
1.0
nan
1.0
nan
1.0
1.0
1.0
1.0
1.0
1.0
1.0
nan
1.0
1.0
1.0
1.0
1.0
1.0
1.0
nan
1.0
nan
1.0
1.0
1.0
1.0
1.0
1.0
1.0
nan
nan
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
nan
1.0
1.0
1.0
nan
1.0
1.0
nan
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


We fill in the missing (NULL) values of the categorical columns using scikit-learn's `SimpleImputer` with the most-frequent-value strategy.

In [393]:
# Rebuilding the categorical frame from the raw data (so that this cell gives the
# same result when it is run again) and normalising two columns up front:
#   - 'side': anything other than 'left'/'right' (missing included) becomes 'unknown'
#   - 'invasive': anything other than 1 (missing included) becomes 0
cat_raw = df[df_cat.columns.tolist()].assign(
    side=df['side'].apply(lambda x: 'unknown' if (x != 'left' and x != 'right') else x),
    invasive=df['invasive'].apply(lambda x: 1 if x == 1 else 0),
)
print(cat_raw.isnull().any())

# Imputation of nulls in categorical columns using Simple Imputer
imp_cat = SimpleImputer(strategy='most_frequent')
columns = cat_raw.columns
index = cat_raw.index
df_cat = pd.DataFrame(imp_cat.fit_transform(cat_raw), columns=columns, index=index)

# Transforming categorical values into numerical variables.
# The *imputed* column is encoded here: encoding the raw df.neoadjuvant instead
# would discard the imputation and turn every missing value into 1.
df_cat['neoadjuvant'] = df_cat['neoadjuvant'].apply(lambda x: 0 if x == 'no' else 1)


side             False
neoadjuvant       True
grade             True
invasive         False
er_positive       True
pr_positive       True
her2_positive     True
hist_type        False
dtype: bool


Next, we replace the categorical labels that are represented as strings with numerical values, so that we no longer have to work with strings.

In [394]:
# Taking 'neoadjuvant' variable out as it is already converted into numerical values.
# df_aux keeps every categorical column except 'side' and 'hist_type'.
# NOTE(review): 'grade', 'invasive' and the receptor columns therefore end up both
# in df_aux and one-hot encoded below - confirm this duplication is intended.
df_aux = df_cat.drop(columns=['side', 'hist_type']).copy()

df_cat = df_cat.drop(columns=['neoadjuvant'])

# Using OneHotEncoder.
# The encoder is left at its default (sparse) output and densified with .toarray(),
# and the feature-name accessor is picked at run time, so the cell does not depend
# on the `sparse`/`sparse_output` and `get_feature_names`/`get_feature_names_out`
# renames across scikit-learn versions.
ohe = preprocessing.OneHotEncoder()
ohe_values = ohe.fit_transform(df_cat).toarray()
if hasattr(ohe, 'get_feature_names_out'):
    ohe_columns = ohe.get_feature_names_out(df_cat.columns.tolist())
else:
    ohe_columns = ohe.get_feature_names(df_cat.columns.tolist())
df_cat_ohe = pd.DataFrame(ohe_values, columns=ohe_columns, index=df_cat.index)

# Combine both DataFrames (df_cat_ohe and df_aux) side by side.
# They share the same rows in the same order, so they are concatenated column-wise;
# merging on 'ehr' would multiply the rows of every repeated 'ehr' value.
df_prepro = pd.concat([df_cat_ohe, df_aux], axis=1)
assert len(df_prepro) == len(df_cat), "row count changed while combining the frames"
print(df_prepro)

      side_left  side_right  side_unknown  grade_1.0  grade_2.0  grade_3.0  \
ehr                                                                          
6849        0.0         0.0           1.0        1.0        0.0        0.0   
268         0.0         0.0           1.0        0.0        1.0        0.0   
268         0.0         0.0           1.0        0.0        1.0        0.0   
268         0.0         0.0           1.0        0.0        1.0        0.0   
268         0.0         0.0           1.0        0.0        1.0        0.0   
...         ...         ...           ...        ...        ...        ...   
191         0.0         0.0           1.0        0.0        1.0        0.0   
6482        0.0         0.0           1.0        0.0        1.0        0.0   
2564        0.0         0.0           1.0        0.0        0.0        1.0   
2730        0.0         0.0           1.0        0.0        1.0        0.0   
2376        0.0         0.0           1.0        1.0        0.0 



We derive each patient's age from the `birth_date` column by applying the functions declared above to the DataFrame; this way we know the age of each patient at the time they were diagnosed.

In [395]:
#Age at which the patient was diagnosed
# (uses the diagnosis helper, so a missing diagnosis date stays NaN instead of becoming age 0)
age_at_diagnosis = df_num.apply(
    lambda row: diagnoseDate_to_ageDiagnosed(row.birth_date, row.diagnosis_date),
    axis=1,
)
pollaman = age_at_diagnosis  # original variable name, kept in case later cells use it

#Age at which the patient died, if so (0 when there is no death date)
age_at_death = df_num.apply(
    lambda row: deathDate_to_age(row.birth_date, row.death_date),
    axis=1,
)
pollaman2 = age_at_death  # original variable name, kept in case later cells use it

#Time of survival since diagnosis, 100 in case of full recovery
# TODO: not computed yet




Now we are going to remove the columns that do not provide relevant information but would increase the computational cost of our data processing,
e.g., columns with too many distinct values, such as descriptions or names.