In [3]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Read the training table and display its column index as a quick schema check.
df = pd.read_csv(filepath_or_buffer='xai_train_data.csv')
df.columns

Index(['Age recode with <1 year olds', 'Age recode with single ages and 90+',
       'CS tumor size (2004-2015)', 'CS extension (2004-2015)',
       'RX Summ--Surg Prim Site (1998+)', 'Sex',
       'Race recode (White, Black, Other)', 'Marital status at diagnosis',
       'Primary Site - labeled', 'Derived AJCC T, 6th ed (2004-2015)',
       'Derived AJCC N, 6th ed (2004-2015)',
       'Derived AJCC M, 6th ed (2004-2015)', 'Summary stage 2000 (1998-2017)',
       'Reason no cancer-directed surgery', 'Radiation recode',
       'Chemotherapy recode (yes, no/unk)',
       'RX Summ--Scope Reg LN Sur (2003+)', 'RX Summ--Surg/Rad Seq',
       'Median household income inflation adj to 2021',
       'Vital status recode (study cutoff used)', 'Survival months',
       'ICD-O-3 Hist/behav',
       'COD to site recode ICD-O-3 2023 Revision Expanded (1999+)'],
      dtype='object')

In [73]:
# Row count per level of the summary-stage column.
stage_column = 'Summary stage 2000 (1998-2017)'
df[stage_column].value_counts()

Localized    21678
Regional      2162
Distant        473
Name: Summary stage 2000 (1998-2017), dtype: int64

In [74]:
# Parallel lists filled in by the sections below: categorical_cols[i] names a
# column to one-hot encode and categories[i] holds the levels for that column.
categorical_cols, categories = [], []

### Age - TODO divide into three total groups
# The first two characters of each age label are parsed as the integer age.
age_labels = df['Age recode with single ages and 90+']
df['Age_coded'] = age_labels.map(lambda label: int(label[:2]))

### Tumor size
# Natural logarithm of the recorded tumor size.
# NOTE(review): a recorded size of 0 becomes -inf here — confirm zeros cannot occur.
df['Tumor_size_coded'] = np.log(df['CS tumor size (2004-2015)'])

### Tumor extension -> Coded as OHE
# Allowed extension codes; they are converted to strings before being
# registered as the levels of the extension column.
tumor_ext_values = [
    100, 330, 400, 200, 300, 310, 500, 315, 350, 355, 375, 320, 340,
    360, 380, 800, 335, 370,
]
tumor_ext_values_str = list(map(str, tumor_ext_values))
categorical_cols.append('CS extension (2004-2015)')
categories.append(tumor_ext_values_str)

### RX Summ--Surg Prim Site (1998+) -> convert to int
surgery_codes = df['RX Summ--Surg Prim Site (1998+)']
df['Prim_site_coded'] = surgery_codes.astype(int)

### Sex binary variable
# 1 for 'Female', 0 for every other value.
df['Sex_coded'] = (df['Sex'] == 'Female').astype('int64')

### Race -> Categorical variable
# Collapse the raw race labels into two groups: 'White' and 'Other'.
race_mapping = {
    'White': 'White',
    'Black': 'Other',
    'Other (American Indian/AK Native, Asian/Pacific Islander)': 'Other',
    'Unknown': 'Other'
}
# Binary flag: 1 for the 'White' group, 0 otherwise. Labels missing from the
# mapping come out of .map() as NaN and therefore also code as 0.
race_groups = df['Race recode (White, Black, Other)'].map(race_mapping)
df['Race_mapped'] = race_groups.eq('White').astype('int64')


### Marital status -> OHE TBD how to combine them
# Raw labels grouped by the coarse category they are collapsed into.
marital_groups = {
    'Married': [
        'Married (including common law)',
        'Widowed',
        'Unmarried or Domestic Partner',
    ],
    'Single': ['Single (never married)'],
    'Other': ['Unknown', 'Divorced', 'Separated'],
}
# Flatten to the raw-label -> category lookup used by .replace().
marital_mapping = {
    raw_label: group
    for group, raw_labels in marital_groups.items()
    for raw_label in raw_labels
}

# Labels that are not keys of the mapping are left unchanged by .replace().
df['Marital_mapped'] = df['Marital status at diagnosis'].replace(marital_mapping)
marital_values = ['Single', 'Married', 'Other']
categorical_cols.append('Marital_mapped')
categories.append(marital_values)

### Primary Site - labeled -> OHE with the classes being HF, trunk and limbs
# Raw site labels grouped by anatomical class; the overlapping-lesion and NOS
# labels are placed in the 'Limbs' class.
site_groups = {
    'HF': [
        'C44.3-Skin other/unspec parts of face',
        'C44.4-Skin of scalp and neck',
        'C44.2-External ear',
        'C44.1-Eyelid',
        'C44.0-Skin of lip, NOS',
    ],
    'Trunk': ['C44.5-Skin of trunk'],
    'Limbs': [
        'C44.6-Skin of upper limb and shoulder',
        'C44.7-Skin of lower limb and hip',
        'C44.8-Overlapping lesion of skin',
        'C44.9-Skin, NOS',
    ],
}
# Flatten to the raw-label -> class lookup used by .replace().
primary_site_mapping = {
    raw_label: site_class
    for site_class, raw_labels in site_groups.items()
    for raw_label in raw_labels
}

# Labels that are not keys of the mapping are left unchanged by .replace().
df['Primary_Site_coded'] = df['Primary Site - labeled'].replace(primary_site_mapping)
primary_site_values = ['HF', 'Limbs', 'Trunk']
categorical_cols.append('Primary_Site_coded')
categories.append(primary_site_values)

### TNM -> OHE
t_col = 'Derived AJCC T, 6th ed (2004-2015)'
n_col = 'Derived AJCC N, 6th ed (2004-2015)'
m_col = 'Derived AJCC M, 6th ed (2004-2015)'

# T: fixed, hand-written list of levels.
t_values = [
    'T0', 'T2a', 'T1a', 'T1NOS', 'T2b', 'T1b', 'T3b', 'T3a', 'TX', 'T4b',
    'T4a', 'T2NOS', 'T4NOS', 'T3NOS',
]

# N --- TODO N0 is 0 and the rest is 1
# Levels are taken from the data, in order of first appearance.
n_values = df[n_col].unique()

# M TODO M0 is 0 and the rest is 1
# Levels are taken from the data, in order of first appearance.
m_values = df[m_col].unique()

# Register the three columns together with their levels, in T, N, M order.
categories.extend([t_values, n_values, m_values])
categorical_cols.extend([t_col, n_col, m_col])

### Summary stage 2000 (1998-2017) -> Ordinal variable or OHE
# Levels are taken from the data, in order of first appearance.
summary_stage_col = 'Summary stage 2000 (1998-2017)'
summary_values = df[summary_stage_col].unique()

categorical_cols.append(summary_stage_col)
categories.append(summary_values)

### Radiation recode -> binary variable
# Labels that count as "radiation given" (coded 1); every other label is coded 0.
radiation_mapping = {'Beam radiation': 1, 'Radiation, NOS  method or source not specified': 1,
                'Radioactive implants (includes brachytherapy) (1988+)': 1}

# .map() yields NaN for any label that is not a key of the mapping, and those
# rows become 0. The earlier version ran its 0/1 clean-up on the raw
# 'Radiation recode' strings instead of the mapped values, so every row ended
# up as 0.
df['Radiation_mapped'] = (
    df['Radiation recode'].map(radiation_mapping).fillna(0).astype('int64')
)

### Chemotherapy recode (yes, no/unk) -> Binary
# 1 when the label is "yes" in any letter case (surrounding whitespace ignored),
# 0 otherwise; missing values code as 0.
# NOTE(review): an exact == 'yes' comparison would code every row as 0 if the
# export spells the label 'Yes' — confirm the spelling with value_counts().
chemo_labels = df['Chemotherapy recode (yes, no/unk)'].astype(str).str.strip().str.lower()
df['Chemotherapy_coded'] = chemo_labels.eq('yes').astype('int64')

### RX Summ--Scope Reg LN Sur (2003+) -> None and Unknown or not applicable as NO and the rest YES
scope_labels = df['RX Summ--Scope Reg LN Sur (2003+)']
# Missing values are treated as "no" as well: recent pandas versions read the
# literal text 'None' in a CSV as NaN, which would otherwise be coded as 1.
no_ln_surgery = scope_labels.isin(['None', 'Unknown or not applicable']) | scope_labels.isna()
df['SLNB_Scope_coded'] = (~no_ln_surgery).astype('int64')

### RX Summ--Surg/Rad Seq -> No rad... as NO the rest YES
# Stored in its own column. Writing this flag into
# 'RX Summ--Scope Reg LN Sur (2003+)' would destroy the raw column read above
# and make a re-run of this cell produce a different SLNB_Scope_coded.
df['Surg_Rad_Seq_coded'] = (
    df['RX Summ--Surg/Rad Seq'] != 'No radiation and/or cancer-directed surgery'
).astype('int64')

### Median household income inflation adj to 2021 -> Two to three categories OHE
def income_categories(income):
    """Bucket an income label into 'high', 'medium' or 'low'.

    Labels starting with '$75,000+' are 'high', labels starting with
    '$65,000' or '$70,000' are 'medium', and every other label is 'low'.
    """
    medium_prefixes = ('$65,000', '$70,000')
    if income.startswith('$75,000+'):
        return 'high'
    return 'medium' if income.startswith(medium_prefixes) else 'low'

# Bucket every row's income label and store the result in a new column.
# NOTE(review): Income_coded holds strings and is not registered in
# categorical_cols — confirm whether it should be one-hot encoded too.
income_labels = df['Median household income inflation adj to 2021']
df['Income_coded'] = income_labels.map(income_categories)


In [75]:
# One-hot encode the registered categorical columns (dropping the first level
# of each) and pass every other column of df through unchanged.
# NOTE(review): the [1:] slices skip the first registered column and its
# levels — confirm that leaving it out of the encoder is intentional.
# NOTE(review): remainder="passthrough" also forwards the raw string columns.
ohe_columns = categorical_cols[1:]
ohe_levels = categories[1:]
one_hot = OneHotEncoder(categories=ohe_levels, drop='first')
ct = ColumnTransformer(
    transformers=[("encoder", one_hot, ohe_columns)],
    remainder="passthrough",
)

# Fit on df and transform it in a single pass.
encoded_data = ct.fit_transform(df)

In [76]:
# Row count per income bucket.
income_counts = df['Income_coded'].value_counts()
income_counts

high      12218
medium     6195
low        5900
Name: Income_coded, dtype: int64

In [66]:
# Engineered feature columns: everything the preprocessing cell derived with a
# '_coded' or '_mapped' suffix. Filtering on '_coded' alone leaves out the
# derived Race_mapped and Radiation_mapped columns.
coded_cols = [col for col in df.columns if col.endswith(('_coded', '_mapped'))]

# Add the columns registered for one-hot encoding, skipping any already listed.
# categorical_cols is a flat list of column names, so it is iterated directly;
# the name `categorical_variables` used here before is not defined anywhere in
# the notebook and raised NameError on a fresh kernel.
# NOTE(review): the encoder cell uses categorical_cols[1:] — confirm whether
# the first registered column belongs in this list.
for col in categorical_cols:
    if col not in coded_cols:
        coded_cols.append(col)


[['Single', 'Married', 'Other'],
 ['HF', 'Limbs', 'Other'],
 ['T0',
  'T2a',
  'T1a',
  'T1NOS',
  'T2b',
  'T1b',
  'T3b',
  'T3a',
  'TX',
  'T4b',
  'T4a',
  'T2NOS',
  'T4NOS',
  'T3NOS'],
 array(['N0', 'NX', 'N1a', 'N2b', 'N3', 'N2a', 'N1NOS', 'N1b', 'N2c',
        'N2NOS', 'N1', 'N2'], dtype=object),
 array(['M0', 'MX', 'M1c', 'M1b', 'M1a', 'M1NOS'], dtype=object),
 array(['Localized', 'Regional', 'Distant'], dtype=object)]