In [9]:
import pandas as pd
from sklearn import preprocessing
from collections import defaultdict
from sklearn.impute import SimpleImputer
import math

In [10]:
# Auxiliary functions

def birthDate_to_age(date):
    age = pd.to_datetime('2022-12-18').year-pd.to_datetime(date).year
    return age

def diagnosisDate_to_age(date):
    age = pd.to_datetime('2022-12-18').year-pd.to_datetime(date).year
    return age

def deathDate_to_age(birthDate, deathDate):
    deathAge = pd.to_datetime(deathDate).year-pd.to_datetime(birthDate).year
    if math.isnan(deathAge):
        deathAge = 0
    return deathAge

In [11]:
df = pd.read_excel("breast_cancer_data.xlsx", index_col='ehr')
df.pop('Unnamed: 0')

# Categorical boolean mask
cat_mask = (df.dtypes==object)

# Filter categorical columns using mask and turn it into a list
cat_cols = df.columns[cat_mask].tolist()

# Exclude date columns as they are not categorical values
cat_cols.remove('birth_date')
cat_cols.remove('diagnosis_date')
cat_cols.remove('death_date')

# Dividing the DataFrame into categorical and numerical columns
df_cat = df[cat_cols]
df_num = df.drop(cat_cols, axis=1)

We delete the NULL values by using the Simple Imputer

In [26]:
# Imputation of nulls in categorical columns
imp_cat = SimpleImputer(strategy='most_frequent')
columns = df_cat.columns
index = df_cat.index
df_cat = pd.DataFrame(imp_cat.fit_transform(df_cat), columns=columns, index=index)

# Transforming categorical values into numerical variables
df_cat.neoadjuvant = df.neoadjuvant.apply(lambda x: 0 if x == 'no' else 1)

Now we are about to subsitute the categorical labels that are represented by strings with numerical values, in order to avoid working with Strings.

In [27]:
# Using OneHotEncoder
ohe = preprocessing.OneHotEncoder(sparse=False)
df_cat_ohe = pd.DataFrame(ohe.fit_transform(df_cat), 
                          columns=ohe.get_feature_names(df_cat.columns.tolist()),
                          index=df_cat.index)
print(df_cat_ohe)

      side_left  side_right  neoadjuvant_0  neoadjuvant_1  hist_type_ductal  \
ehr                                                                           
6849        0.0         1.0            1.0            0.0               1.0   
268         0.0         1.0            1.0            0.0               0.0   
1458        0.0         1.0            1.0            0.0               1.0   
268         0.0         1.0            1.0            0.0               0.0   
2013        0.0         1.0            0.0            1.0               1.0   
...         ...         ...            ...            ...               ...   
191         0.0         1.0            1.0            0.0               1.0   
6482        0.0         1.0            1.0            0.0               0.0   
2564        0.0         1.0            0.0            1.0               1.0   
2730        0.0         1.0            0.0            1.0               0.0   
2376        0.0         1.0            1.0          



We change the value of the birth_date column to the age of each patient by applying the above declared function to the dataframe

In [14]:
df_num.death_date = df_num.apply(lambda x: deathDate_to_age(x.birth_date, x.death_date), axis=1)
#df_num.birth_date = df_num.birth_date.apply(birthDate_to_age)
for age in df_num.death_date:
    print(age)

0
0
0
0
0
71
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
77
76
0
0
66
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
104
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
96
69
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
68
0
0
0
73
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
65
0
0
43
0
0
0
0
0
0
39
100
38
0
0
0
54
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
80
0
0
0
0
0
0
0
0
0
0
49
0
73
0
0
0
0
0
0
0
0
0


Now we are going to eliminate the labels that do not give relevant information but would increase the computational cost of our data treatments.
e.g: Columns with too many different values, such as descriptions or names.