In [23]:
import pandas as pd
from sklearn import preprocessing
from collections import defaultdict
from sklearn.impute import SimpleImputer

In [24]:
# Read the spreadsheet, using its 'ehr' column as the row index.
df = pd.read_excel('breast_cancer_data.xlsx', index_col='ehr')

# Report, column by column, whether at least one value is missing.
has_missing = df.isnull().any()
print(has_missing)

Unnamed: 0         False
side                True
neoadjuvant         True
grade               True
invasive            True
er_positive         True
pr_positive         True
her2_positive       True
ki67                True
birth_date         False
diagnosis_date     False
death_date          True
recurrence_year     True
menarche_age        True
menopause_age       True
pregnancy           True
abort               True
birth              False
caesarean           True
hist_type          False
dtype: bool


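We replace the missing (NULL) values using scikit-learn's `SimpleImputer` with the `most_frequent` strategy, i.e. each missing value is filled with the most common value of its column.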

In [25]:
# Fill each missing entry with the most frequent value of its column.
# NOTE(review): the strategy is applied to every column of `df`, including
# date columns such as `death_date` -- confirm that is intended.
imp_cat = SimpleImputer(strategy='most_frequent')
columns = df.columns
index = df.index

# fit_transform returns a plain array, so the original row index and column
# names are re-attached when wrapping it back into a DataFrame.
imputed_values = imp_cat.fit_transform(df)
df_cat = pd.DataFrame(imputed_values, index=index, columns=columns)

# Sanity check: report per column whether any missing value remains.
print(df_cat.isnull().any())

Unnamed: 0         False
side               False
neoadjuvant        False
grade              False
invasive           False
er_positive        False
pr_positive        False
her2_positive      False
ki67               False
birth_date         False
diagnosis_date     False
death_date         False
recurrence_year    False
menarche_age       False
menopause_age      False
pregnancy          False
abort              False
birth              False
caesarean          False
hist_type          False
dtype: bool


Next, we eliminate the columns that do not provide relevant information but would increase the computational cost of processing the data,
e.g. columns with too many distinct values, such as descriptions or names.

Now we substitute the categorical labels that are represented by strings with numerical values (one-hot encoding), in order to avoid working with strings.

In [27]:
# One-hot encode every column of the imputed frame: each distinct value of a
# column becomes its own 0/1 indicator column named "<column>_<value>".
#
# The dense-output keyword was renamed in scikit-learn: `sparse` was deprecated
# in 1.2 and removed in 1.4 in favour of `sparse_output`. Try the current name
# first and fall back to the old one, so the cell runs on both old and new
# releases and `ohe` stays a dense-output encoder either way.
# NOTE(review): all columns are encoded, including `Unnamed: 0`; the printed
# result shows a separate indicator column per value of it (`Unnamed: 0_AA`,
# `Unnamed: 0_AB`, ...) -- confirm whether that column should be dropped first.
try:
    ohe = preprocessing.OneHotEncoder(sparse_output=False)
except TypeError:
    # scikit-learn < 1.2 only knows the legacy keyword.
    ohe = preprocessing.OneHotEncoder(sparse=False)

encoded_values = ohe.fit_transform(df_cat)
df_cat_ohe = pd.DataFrame(
    encoded_values,
    columns=ohe.get_feature_names_out(df_cat.columns.tolist()),
    index=df_cat.index,
)

print(df_cat_ohe)

      Unnamed: 0_AA  Unnamed: 0_AB  Unnamed: 0_AC  Unnamed: 0_AD  \
ehr                                                                
6849            1.0            0.0            0.0            0.0   
268             0.0            1.0            0.0            0.0   
1458            0.0            0.0            1.0            0.0   
268             0.0            0.0            0.0            1.0   
2013            0.0            0.0            0.0            0.0   
...             ...            ...            ...            ...   
191             0.0            0.0            0.0            0.0   
6482            0.0            0.0            0.0            0.0   
2564            0.0            0.0            0.0            0.0   
2730            0.0            0.0            0.0            0.0   
2376            0.0            0.0            0.0            0.0   

      Unnamed: 0_AE  Unnamed: 0_AF  Unnamed: 0_AG  Unnamed: 0_AH  \
ehr                                            