In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import os

### Label encoding and one-hot encoding the full real data

In [None]:
# Read the full real dataset from CSV and show its (rows, columns) shape.
full_sample_csv = "path_to_data/breast_survival_10-15_sample.csv"
breast_survival_full = pd.read_csv(full_sample_csv)
breast_survival_full.shape

In [None]:
# Distinct Sex values in the full dataset, inspected before label encoding.
breast_survival_full['Sex'].unique()

# Integer code that replace_dict assigns to each label:
# 1 Male
# 2 Female

In [None]:
# Distinct CombinedSummaryStage values in the full dataset, inspected before
# label encoding.
breast_survival_full['CombinedSummaryStage'].unique()

# Integer code that replace_dict assigns to each stage label
# (the codes are not consecutive: 5, 6 and 8 are unused):
# 1 Localized only
# 2 Regional by direct extension only
# 3 Regional lymph nodes involved only
# 4 Regional by both direct extension and lymph node involvement
# 7 Distant site(s)/node(s) involved
# 9 Unknown/unstaged/unspecified/DCO

In [None]:
# Distinct Grade values in the full dataset, inspected before label encoding.
breast_survival_full['Grade'].unique()

# Integer code that replace_dict assigns to each grade label:
# 1 Well differentiated; Grade I
# 2 Moderately differentiated; Grade II
# 3 Poorly differentiated; Grade III
# 4 Undifferentiated; anaplastic; Grade IV
# 9 Unknown

In [None]:
# Distinct SeqNum values in the full dataset, inspected before label encoding.
breast_survival_full['SeqNum'].unique()

# Integer code that replace_dict assigns to each sequence-number label.
# replace_dict additionally defines codes 9, 10 and 12 (9th/10th/12th
# primaries), which are not listed below.
# 0 One primary only
# 1 1st of 2 or more primaries
# 2 2nd of 2 or more primaries
# 3 3rd of 3 or more primaries
# 4 4th of 4 or more primaries
# 5 5th of 5 or more primaries
# 6 6th of 6 or more primaries
# 7 7th of 7 or more primaries
# 8 8th of 8 or more primaries
# 11 11th of 11 or more primaries
# 20 20th of 20 or more primaries
# 99 Unknown seq num - federally required in situ or malig tumors

In [None]:
# Distinct Age values in the full dataset, inspected before label encoding;
# replace_dict maps the bands '15-19 years' .. '85+ years' to the codes 1..15.
breast_survival_full['Age'].unique()

In [None]:
# Distinct SurvivalMonths values in the full dataset, inspected before
# conversion; survival_months_dict maps zero-padded strings such as '0072'
# to the corresponding integers.
breast_survival_full['SurvivalMonths'].unique()

In [None]:
# Lookup for the SurvivalMonths column: every zero-padded, 4-character month
# string from '0000' through '0119' maps to its integer value (0..119).
survival_months_dict = {
    'SurvivalMonths': {str(months).zfill(4): months for months in range(120)}
}

In [None]:
# Label-encoding lookup keyed by column name; each inner dict maps a raw value
# in that column to the integer code that replaces it.
_age_bands = [
    '15-19 years', '20-24 years', '25-29 years', '30-34 years', '35-39 years',
    '40-44 years', '45-49 years', '50-54 years', '55-59 years', '60-64 years',
    '65-69 years', '70-74 years', '75-79 years', '80-84 years', '85+ years',
]

replace_dict = {
    'Sex': {'Male': 1, 'Female': 2},
    # Diagnosis years 2010..2015 become 1..6.
    'YearDx': {year: year - 2009 for year in range(2010, 2016)},
    # Consecutive age bands become 1..15.
    'Age': {band: code for code, band in enumerate(_age_bands, start=1)},
    'CombinedSummaryStage': {
        'Localized only': 1,
        'Regional by direct extension only': 2,
        'Regional lymph nodes involved only': 3,
        'Regional by both direct extension and lymph node involvement': 4,
        'Distant site(s)/node(s) involved': 7,
        'Unknown/unstaged/unspecified/DCO': 9,
    },
    'Grade': {
        'Well differentiated; Grade I': 1,
        'Moderately differentiated; Grade II': 2,
        'Poorly differentiated; Grade III': 3,
        'Undifferentiated; anaplastic; Grade IV': 4,
        'Unknown': 9,
    },
    'SeqNum': {
        'One primary only': 0,
        '1st of 2 or more primaries': 1,
        '2nd of 2 or more primaries': 2,
        '3rd of 3 or more primaries': 3,
        '4th of 4 or more primaries': 4,
        '5th of 5 or more primaries': 5,
        '6th of 6 or more primaries': 6,
        '7th of 7 or more primaries': 7,
        '8th of 8 or more primaries': 8,
        '9th of 9 or more primaries': 9,
        '10th of 10 or more primaries': 10,
        '11th of 11 or more primaries': 11,
        '12th of 12 or more primaries': 12,
        '20th of 20 or more primaries': 20,
        'Unknown seq num - federally required in situ or malig tumors': 99,
    },
    'VitalStatus': {'Alive': 0, 'Dead': 1},
}

In [None]:
# Label-encoded columns that are standardized with StandardScaler further down.
num_cols = [
    'Age',
    'YearDx',
    'Sex',
    'Grade',
    'SeqNum',
    'CombinedSummaryStage',
]

# Categorical columns that are expanded by the OneHotEncoder.
ohe_cols = [
    'Race',
    'ICDO3',
    'Laterality',
    'PrimarySite-labeled',
    'DiagnosticConfirmation',
    'ICCCSite',
]

In [None]:
# Label-encode the full dataset: first the categorical lookups in
# replace_dict, then the zero-padded SurvivalMonths strings.
breast_survival_full = (
    breast_survival_full
    .replace(replace_dict)
    .replace(survival_months_dict)
)
# Any SurvivalMonths value that is still non-numeric is coerced to NaN.
breast_survival_full = breast_survival_full.assign(
    SurvivalMonths=pd.to_numeric(breast_survival_full['SurvivalMonths'], errors='coerce')
)
breast_survival_full.describe()

In [None]:
# Fit the one-hot encoder on the full real dataset. The fitted `ohe` is reused
# (transform only) by the holdout and synthetic cells further down.
df_full = breast_survival_full.copy()
ohe = OneHotEncoder()
feature_arr = ohe.fit_transform(df_full[ohe_cols]).toarray()
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use its replacement and fall back only on older releases.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_labels = ohe.get_feature_names_out(ohe_cols)
else:
    ohe_labels = ohe.get_feature_names(ohe_cols)
# Build the feature frame on df_full's own index: DataFrame.join aligns on the
# index, so this keeps rows paired even if the index is not a default 0..n-1.
features = pd.DataFrame(
    feature_arr,
    columns=ohe_labels,
    index=df_full.index)
# Swap the raw categorical columns for their one-hot indicator columns.
df_full = df_full.drop(ohe_cols, axis=1)
df_full = df_full.join(features)
df_full

### Label encoding, one-hot encoding, and normalizing the holdout data

In [None]:
# Read the holdout dataset from CSV and display it.
holdout_csv = "path_to_data/holdout.csv"
breast_survival = pd.read_csv(holdout_csv)
breast_survival

In [None]:
# Label-encode the holdout set with the same lookups used for the full data:
# the categorical codes in replace_dict, then the SurvivalMonths strings.
breast_survival = (
    breast_survival
    .replace(replace_dict)
    .replace(survival_months_dict)
)
# Any SurvivalMonths value that is still non-numeric is coerced to NaN.
breast_survival = breast_survival.assign(
    SurvivalMonths=pd.to_numeric(breast_survival['SurvivalMonths'], errors='coerce')
)
# Saved relative to the current working directory (not under path_to_data).
breast_survival.to_csv('holdout_le.csv', index=False)

In [None]:
# One-hot encode the holdout set with the encoder already fitted on the full
# real dataset (transform only, no re-fit).
df = breast_survival.copy()
feature_arr = ohe.transform(df[ohe_cols]).toarray()
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use its replacement and fall back only on older releases.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_labels = ohe.get_feature_names_out(ohe_cols)
else:
    ohe_labels = ohe.get_feature_names(ohe_cols)
# Build the feature frame on df's own index: DataFrame.join aligns on the
# index, so this keeps rows paired even if the index is not a default 0..n-1.
features = pd.DataFrame(
    feature_arr,
    columns=ohe_labels,
    index=df.index)
# Swap the raw categorical columns for their one-hot indicator columns.
df = df.drop(ohe_cols, axis=1)
df = df.join(features)
df

In [None]:
# Standardize the label-encoded columns of the one-hot-encoded holdout frame
# and save the result.
# NOTE(review): the scaler is fitted on the holdout data here and later reused
# via scaler.transform on the synthetic data, whereas the one-hot encoder is
# fitted on the full real dataset -- confirm this asymmetry is intended.
scaler = StandardScaler()
scaler.fit(df[num_cols])
df[num_cols] = scaler.transform(df[num_cols])
holdout_ohe_path = os.path.join('path_to_data', 'holdout_ohe.csv')
df.to_csv(holdout_ohe_path, index=False)
df

### Label encoding, one-hot encoding, and normalizing the synthetic data

In [None]:
# Read the synthetic dataset (duplicates still included) and display it.
synth_csv = 'path_to_data/synth1M_duplicates.csv'
breast_survival_synth = pd.read_csv(synth_csv)
breast_survival_synth

In [None]:
# Drop repeated synthetic rows. The 'Unnamed: 0' column is left out of the
# comparison, so rows that differ only in that column count as duplicates.
dedup_cols = breast_survival_synth.columns.difference(['Unnamed: 0'])
breast_survival_synth = (
    breast_survival_synth
    .drop_duplicates(subset=dedup_cols)
    .reset_index(drop=True)
)
# Saved relative to the current working directory (not under path_to_data).
breast_survival_synth.to_csv('synth1M.csv', index=False)
breast_survival_synth

In [None]:
# Label-encode the synthetic set with the same lookups used for the real data:
# the categorical codes in replace_dict, then the SurvivalMonths strings.
breast_survival_synth = (
    breast_survival_synth
    .replace(replace_dict)
    .replace(survival_months_dict)
)
# Any SurvivalMonths value that is still non-numeric is coerced to NaN.
breast_survival_synth = breast_survival_synth.assign(
    SurvivalMonths=pd.to_numeric(breast_survival_synth['SurvivalMonths'], errors='coerce')
)
# Saved relative to the current working directory (not under path_to_data).
breast_survival_synth.to_csv('synth1M_le.csv', index=False)
breast_survival_synth

In [None]:
# One-hot encode the de-duplicated synthetic set with the encoder already
# fitted on the full real dataset (transform only, no re-fit).
df = breast_survival_synth.copy()
feature_arr = ohe.transform(df[ohe_cols]).toarray()
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use its replacement and fall back only on older releases.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_labels = ohe.get_feature_names_out(ohe_cols)
else:
    ohe_labels = ohe.get_feature_names(ohe_cols)
# Build the feature frame on df's own index: DataFrame.join aligns on the
# index, so this keeps rows paired even if the index is not a default 0..n-1.
features = pd.DataFrame(
    feature_arr,
    columns=ohe_labels,
    index=df.index)
# Swap the raw categorical columns for their one-hot indicator columns.
df = df.drop(ohe_cols, axis=1)
df = df.join(features)
df

In [None]:
# Standardize the label-encoded columns with the already-fitted scaler
# (transform only, no re-fit), then save the frame.
scaled_num = scaler.transform(df[num_cols])
df[num_cols] = scaled_num
synth_ohe_path = os.path.join('path_to_data', 'synth1M_ohe.csv')
df.to_csv(synth_ohe_path, index=False)
df