In [1]:
#import kagglehub - no longer needed once using local .csv file
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score
#from sklearn.preprocessing import LabelEncoder
from sksurv.ensemble import RandomSurvivalForest
from sksurv.metrics import concordance_index_censored
from sklearn.inspection import permutation_importance
from itertools import product

## Skip below after first run ##

In [None]:
path = kagglehub.dataset_download("reihanenamdari/breast-cancer")
path #path to data download on local machine

Downloading from https://www.kaggle.com/api/v1/datasets/download/reihanenamdari/breast-cancer?dataset_version_number=1...


100%|██████████| 42.8k/42.8k [00:00<00:00, 1.22MB/s]

Extracting files...





In [2]:
df = pd.read_csv('Breast_Cancer.csv')

In [3]:
df.head()

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [4]:
#Drop columns - drops Race, Marital Status, 6th stage, differentiate, A stage
df.drop(df.columns[[1,2,5,6,8]], axis=1, inplace=True)

In [5]:
df.columns

Index(['Age', 'T Stage ', 'N Stage', 'Grade', 'Tumor Size', 'Estrogen Status',
       'Progesterone Status', 'Regional Node Examined',
       'Reginol Node Positive', 'Survival Months', 'Status'],
      dtype='object')

In [6]:
#Encode Categorical Columns
df['T Stage '] = df['T Stage '].map({'T1':1,'T2':2, 'T3':3,'T4':4})
df['N Stage'] = df['N Stage'].map({'N1':1,'N2':2, 'N3':3})
df['Estrogen Status'] = df['Estrogen Status'].map({'Positive':1,'Negative':0})
df['Progesterone Status'] = df['Progesterone Status'].map({'Positive': 1,'Negative': 0})
df['Status'] = df['Status'].map({'Alive':1,'Dead':0})
#Force to numeric and drop those with missing grades
df['Grade'] = pd.to_numeric(df['Grade'], errors = 'coerce')
df = df.dropna(subset = ['Grade'])

In [7]:
#check shape of dataset
print(df.shape)

(4005, 11)


In [8]:
#Create a survival object dataframe
y = df[['Status','Survival Months']]

In [9]:
y.head()

Unnamed: 0,Status,Survival Months
0,1,60
1,1,62
2,1,75
3,1,84
4,1,50


Below will create a list of tuples, 'Status' and 'Survival Months' by row and keeps Status boolean
and Survival Months as a 64 bit float

These two variables define survival time and status as target variables.

This helps us use sckit-surival library for analysis.  It requires this tuple format.S

In [11]:
y_structured = np.array([(bool(status), months) for status, months in zip(y['Status'], y['Survival Months'])],
                        dtype = [('Status','bool'), ('Survival Months', 'f8')])

In [12]:
y_structured 
# Status of True means event happened (Death)

array([( True,  60.), ( True,  62.), ( True,  75.), ..., ( True,  69.),
       ( True,  72.), ( True, 100.)],
      shape=(4005,), dtype=[('Status', '?'), ('Survival Months', '<f8')])

In [13]:
#Removes target columns from original dataframe, which contains our features only
X= df.drop(columns = ['Status','Survival Months'])

In [14]:
#X becomes our Independent variables
X.head()

Unnamed: 0,Age,T Stage,N Stage,Grade,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive
0,68,1,1,3.0,4,1,1,24,1
1,50,2,2,2.0,35,1,1,14,5
2,58,3,3,2.0,63,1,1,14,7
3,58,1,1,3.0,18,1,1,2,1
4,47,2,1,3.0,41,1,1,3,1


**Feature and Model Selection**

We do permutation based feature importance to identify top 5 features.  This reduces dimensionality and focuses on variables we think are more predictive.

Random Survival Forest is chosen as the model approach due to its robustness with non-linear relationships and handling high dimensional data.  Random Survival Forest is good for predicting survival time distributions.

In [15]:
# Fit a random survival forest to get feature importances
# 100 trees, a node must have 10 samples to split with each leaf having at least 15 samples, random_state allows reproducibility
rsf = RandomSurvivalForest(n_estimators = 100, min_samples_split = 10, min_samples_leaf=15, n_jobs = -1, random_state = 42)
rsf.fit(X, y_structured)


In [16]:
# Custom scoring function for survival analysis - closer to 1 is better.  0.5 is a random guess.  
# C-index measures how well the model predicts who survives longer
def cindex_score(model, x, y):
    prediction = model.predict(x)
    result = concordance_index_censored(
        y['Status'], y['Survival Months'], prediction
    )[0]
    return result

# permutation based feature importance
# This evaluates how much each feature impacts performance by randomly shuffling values and seeing how much performance drops.
# More drop = more important feature
result = permutation_importance (
    estimator = rsf,
    X=X,
    y=y_structured,
    n_repeats = 10,
    random_state = 42,
    n_jobs = -1,
    scoring = cindex_score #use custom Concordance Index scorer
)

importance_df = pd.Series(result.importances_mean, index=X.columns).sort_values(ascending = False)
print("Permutation importances:\n", importance_df)

Permutation importances:
 Age                       0.080337
Regional Node Examined    0.068393
Tumor Size                0.065415
Reginol Node Positive     0.048745
Progesterone Status       0.035024
Grade                     0.031666
N Stage                   0.015826
T Stage                   0.013905
Estrogen Status           0.002929
dtype: float64


In [17]:
# Top 5 features
top_features = importance_df.head(5).index.tolist()
X_selected = X[top_features]

In [18]:
X_selected

Unnamed: 0,Age,Regional Node Examined,Tumor Size,Reginol Node Positive,Progesterone Status
0,68,24,4,1,1
1,50,14,35,5,1
2,58,14,63,7,1
3,58,2,18,1,1
4,47,3,41,1,1
...,...,...,...,...,...
4019,62,1,9,1,1
4020,56,14,46,8,1
4021,68,11,22,3,0
4022,58,11,44,1,1


**Hyperparameter Tuning**

Simply, we look at all the trees and number of features selected at each split.  This aims to optimize Concordance Index, which is a standard metric in survival analysis.

In [19]:
# Hyperparameter tuning using grid search - accuracy doesn't work for GridSearchCV
# Create manual grid search instead
best_score = -np.inf
best_parames = None

#so let's evaluate 100,200, 300 trees with between 1-5 features
for n_est, max_feat in product([100,200,300],[1,2,3,4,5]):
    model = RandomSurvivalForest(
        n_estimators=n_est,
        max_features = max_feat,
        min_samples_leaf=15,
        min_samples_split=10,
        n_jobs=-1,
        random_state=42
    )
    model.fit(X_selected,y_structured)
    score = cindex_score(model,X_selected,y_structured)
    
    print(f"Params: n_estimators={n_est}, max_features = {max_feat}, c-index={score: .4f}")
    
    if score > best_score:
        best_score = score
        best_params = (n_est,max_feat)

print(f"\n Best Params: {best_params}, Best C-index: {best_score: .4f}")

Params: n_estimators=100, max_features = 1, c-index= 0.6539
Params: n_estimators=100, max_features = 2, c-index= 0.6619
Params: n_estimators=100, max_features = 3, c-index= 0.6677
Params: n_estimators=100, max_features = 4, c-index= 0.6733
Params: n_estimators=100, max_features = 5, c-index= 0.6765
Params: n_estimators=200, max_features = 1, c-index= 0.6554
Params: n_estimators=200, max_features = 2, c-index= 0.6647
Params: n_estimators=200, max_features = 3, c-index= 0.6716
Params: n_estimators=200, max_features = 4, c-index= 0.6766
Params: n_estimators=200, max_features = 5, c-index= 0.6800
Params: n_estimators=300, max_features = 1, c-index= 0.6573
Params: n_estimators=300, max_features = 2, c-index= 0.6654
Params: n_estimators=300, max_features = 3, c-index= 0.6729
Params: n_estimators=300, max_features = 4, c-index= 0.6773
Params: n_estimators=300, max_features = 5, c-index= 0.6809

 Best Params: (300, 5), Best C-index:  0.6809


In [20]:
#Train best model
best_rsf = RandomSurvivalForest(
    n_estimators = best_params[0],
    max_features = best_params[1],
    min_samples_leaf=15,
    min_samples_split=10,
    n_jobs = -1,
    random_state = 42
)
best_rsf.fit(X_selected,y_structured)

In [21]:
#Prediction
pred_surv = best_rsf.predict(X_selected)

**Result interpretation**

0.5: Model performs no better than random chance

Greater than 0.7: Indicates good discriminative ability

Greater than 0.8: Indicates strong predictive performance.  A higher C-index suggests that the model effectively distinguishes between patients with different survival outcomes

In [22]:
#Evaluation
c_index = concordance_index_censored(
    event_indicator = y_structured['Status'],
    event_time = y_structured['Survival Months'],
    estimate = best_rsf.predict(X_selected)
)[0]

print('Concordance Index:', c_index)

Concordance Index: 0.6809212025020233
