In [2]:
# Necessary imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load your dataset
data= pd.read_csv("C:\\Users\\MASSIVE\\Downloads\\datathon\\train.csv")

# Drop irrelevant features
data = data.drop(columns=["Patient_ID"])

In [3]:
data.head()

Unnamed: 0,Age,Marital_Status,Year of Operation,Positive_Axillary_Nodes,Tumor_Size,Radiation_Therapy,Chemotherapy,Hormone_Therapy,Survival_Status
0,77,Married,1962,5,3.0,No,Yes,No,1
1,36,Married,1964,2,1.9,Yes,No,No,1
2,47,Married,1960,5,2.0,No,No,No,0
3,54,Married,1965,0,1.4,No,No,No,0
4,35,Single,1968,5,4.1,Yes,Yes,Yes,1


In [4]:
data['Marital_Status'] = data['Marital_Status'].map({'Married': 1, 'Single': 0})
data['Radiation_Therapy'] = data['Radiation_Therapy'].map({'Yes': 1, 'No': 0})
data['Chemotherapy'] = data['Chemotherapy'].map({'Yes': 1, 'No': 0})
data['Hormone_Therapy'] = data['Hormone_Therapy'].map({'Yes': 1, 'No': 0})

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Separate features and target
X = data.drop("Survival_Status", axis=1)
y = data["Survival_Status"]

# Assuming 'X' is your feature DataFrame and contains the relevant columns
X['Age_to_Year_Ratio'] = X['Age'] / X['Year of Operation']
X['Tumor_Size_to_Positive_Nodes_Ratio'] = X['Tumor_Size'] / (X['Positive_Axillary_Nodes'] + 1)  # Avoid division by zero
X['Treatment_Interaction'] = X['Radiation_Therapy'] * X['Chemotherapy']
X['Age_Times_Tumor_Size'] = X['Age'] * X['Tumor_Size']
X['Years_Since_Operation'] = X['Year of Operation'] - X['Age']
X['Positive_Nodes_Squared'] = X['Positive_Axillary_Nodes'] ** 2

# Now X has the new features added


# Polynomial features only for numeric columns
numeric_features = ["Age", "Tumor_Size", "Positive_Axillary_Nodes", "Year of Operation"]
categorical_features = ["Marital_Status", "Radiation_Therapy", "Chemotherapy", "Hormone_Therapy"]

poly = PolynomialFeatures(degree=2, interaction_only=True)
X_numeric_poly = poly.fit_transform(X[numeric_features])

# Create a new DataFrame with polynomial and categorical data
X_poly_df = pd.DataFrame(X_numeric_poly, columns=poly.get_feature_names_out(numeric_features))
X_processed = pd.concat([X_poly_df, X[categorical_features].reset_index(drop=True)], axis=1)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

# Handle class imbalance with SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Preprocessing pipeline for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), slice(0, X_numeric_poly.shape[1])),  # Scales polynomial features
        ("cat", OneHotEncoder(drop='first'), slice(X_numeric_poly.shape[1], X_processed.shape[1]))
    ]
)

# Model pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))
])

# Train the model
model.fit(X_train_res, y_train_res)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 55.33%


In [17]:
X.head()

Unnamed: 0,Age,Marital_Status,Year of Operation,Positive_Axillary_Nodes,Tumor_Size,Radiation_Therapy,Chemotherapy,Hormone_Therapy,Age Marital_Status,Age Year of Operation,...,Positive_Axillary_Nodes Radiation_Therapy,Positive_Axillary_Nodes Chemotherapy,Positive_Axillary_Nodes Hormone_Therapy,Tumor_Size Radiation_Therapy,Tumor_Size Chemotherapy,Tumor_Size Hormone_Therapy,Radiation_Therapy Chemotherapy,Radiation_Therapy Hormone_Therapy,Chemotherapy Hormone_Therapy,Age_Category
0,77.0,1.0,1962.0,5.0,3.0,0.0,1.0,0.0,77.0,151074.0,...,0.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,Old
1,36.0,1.0,1964.0,2.0,1.9,1.0,0.0,0.0,36.0,70704.0,...,2.0,0.0,0.0,1.9,0.0,0.0,0.0,0.0,0.0,Middle_Aged
2,47.0,1.0,1960.0,5.0,2.0,0.0,0.0,0.0,47.0,92120.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Middle_Aged
3,54.0,1.0,1965.0,0.0,1.4,0.0,0.0,0.0,54.0,106110.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Middle_Aged
4,35.0,0.0,1968.0,5.0,4.1,1.0,1.0,1.0,0.0,68880.0,...,5.0,5.0,5.0,4.1,4.1,4.1,1.0,1.0,1.0,Middle_Aged


In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score

# Separate features and target
X = data.drop("Survival_Status", axis=1)
y = data["Survival_Status"]

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define numeric and categorical features
numeric_features = ["Age", "Tumor_Size", "Positive_Axillary_Nodes", "Year of Operation"]
categorical_features = ["Marital_Status", "Radiation_Therapy", "Chemotherapy", "Hormone_Therapy"]

# Preprocessing: Standardize numeric features and OneHotEncode categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_features),
        ("cat", OneHotEncoder(drop='first'), categorical_features),
    ]
)

# Build a KNN model pipeline
knn_model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier())
])

# Define the parameter grid for tuning
param_grid = {
    'classifier__n_neighbors': [3, 5, 7, 9, 11],  # Example values for K
    'classifier__weights': ['uniform', 'distance'],
}

# Perform grid search to find the best parameters
grid_search = GridSearchCV(knn_model, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best parameters
print(f"Best Parameters: {grid_search.best_params_}")

# Make predictions
y_pred = grid_search.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")


Best Parameters: {'classifier__n_neighbors': 3, 'classifier__weights': 'distance'}
Accuracy: 56.33%


In [6]:
from xgboost import XGBClassifier

# Replace RandomForest with XGBoost in the pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42))
])

# Train the model
model.fit(X_train_res, y_train_res)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"XGBoost Accuracy: {accuracy * 100:.2f}%")


XGBoost Accuracy: 51.67%


In [7]:
from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    "classifier__n_estimators": [50, 100, 200],
    "classifier__max_depth": [3, 5, 10, None],
    "classifier__learning_rate": [0.01, 0.05, 0.1, 0.2]
}

random_search = RandomizedSearchCV(model, param_dist, n_iter=10, cv=5, random_state=42)
random_search.fit(X_train_res, y_train_res)

print(f"Best Parameters: {random_search.best_params_}")


Best Parameters: {'classifier__n_estimators': 200, 'classifier__max_depth': 3, 'classifier__learning_rate': 0.1}


In [9]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE  # Optional for handling class imbalance

# Assuming 'data' is your DataFrame
# Separate features and target
X = data.drop("Survival_Status", axis=1)  # Replace with actual target column name
y = data["Survival_Status"]

# Feature Engineering
# Example: Categorizing ages into bins
X['Age_Category'] = pd.cut(X['Age'], bins=[0, 30, 60, 90], labels=['Young', 'Middle_Aged', 'Old'])

# Adding polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_numeric = X.select_dtypes(include=[np.number])  # Only numeric columns for polynomial features
X_poly = poly.fit_transform(X_numeric)

# Create a DataFrame for the polynomial features and keep original categorical columns
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X_numeric.columns))

# Concatenate original categorical features with new polynomial features
X = pd.concat([X_poly_df, X[['Marital_Status', 'Radiation_Therapy', 'Chemotherapy', 'Hormone_Therapy', 'Age_Category']].reset_index(drop=True)], axis=1)

# Remove duplicate column names if any
X = X.loc[:, ~X.columns.duplicated()]

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline for numerical and categorical features
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = ['Marital_Status', 'Radiation_Therapy', 'Chemotherapy', 'Hormone_Therapy', 'Age_Category']

# Preprocessor with imputer and scaler
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="mean")), 
            ("scaler", StandardScaler())
        ]), numeric_features),
        ("cat", OneHotEncoder(drop='first'), categorical_features),
    ]
)

# Build a model pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier())
])

# Hyperparameter tuning using GridSearchCV
param_grid = {
    "classifier__n_neighbors": np.arange(1, 21),  # Try neighbors from 1 to 20
    "classifier__weights": ['uniform', 'distance']
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Cross-validation to get a better estimate of model performance
cv_scores = cross_val_score(best_model, X, y, cv=5)
print(f"Mean CV Accuracy: {np.mean(cv_scores) * 100:.2f}%")

# Optionally, use SMOTE for oversampling (uncomment if needed)
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train the best model
best_model.fit(X_train, y_train)

# Evaluate on test set
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.4s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.1s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.1s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.1s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.1s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.1s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.1s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.1s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.1s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.1s
[CV] END classifier__n_neighbors=2, classifier__weights=uniform; total time=   0.1s
[CV] END 

In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE  # Optional for handling class imbalance

# Assuming 'data' is your DataFrame
# Separate features and target
X = data.drop("Survival_Status", axis=1)  # Replace with actual target column name
y = data["Survival_Status"]

# Feature Engineering
# Example: Categorizing ages into bins
X['Age_Category'] = pd.cut(X['Age'], bins=[0, 30, 60, 90], labels=['Young', 'Middle_Aged', 'Old'])



# Adding polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_numeric = X.select_dtypes(include=[np.number])  # Only numeric columns for polynomial features
X_poly = poly.fit_transform(X_numeric)

# Create a DataFrame for the polynomial features and keep original categorical columns
X_poly_df = pd.DataFrame(X_poly, columns=poly.get_feature_names_out(X_numeric.columns))

# Concatenate original categorical features with new polynomial features
X = pd.concat([X_poly_df, X[['Marital_Status', 'Radiation_Therapy', 'Chemotherapy', 'Hormone_Therapy', 'Age_Category']].reset_index(drop=True)], axis=1)

# Remove duplicate column names if any
X = X.loc[:, ~X.columns.duplicated()]

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocessing pipeline for numerical and categorical features
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = ['Marital_Status', 'Radiation_Therapy', 'Chemotherapy', 'Hormone_Therapy', 'Age_Category']

# Preprocessor with imputer and scaler
preprocessor = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imputer", SimpleImputer(strategy="mean")), 
            ("scaler", StandardScaler())
        ]), numeric_features),
        ("cat", OneHotEncoder(drop='first'), categorical_features),
    ]
)

# Build a model pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", KNeighborsClassifier())
])

# Hyperparameter tuning using GridSearchCV
param_grid = {
    "classifier__n_neighbors": np.arange(1, 21),  # Try neighbors from 1 to 20
    "classifier__weights": ['uniform', 'distance']
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Cross-validation to get a better estimate of model performance
cv_scores = cross_val_score(best_model, X, y, cv=5)
print(f"Mean CV Accuracy: {np.mean(cv_scores) * 100:.2f}%")

# Optionally, use SMOTE for oversampling (uncomment if needed)
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Train the best model
best_model.fit(X_train, y_train)

# Evaluate on test set
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=uniform; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.0s
[CV] END classifier__n_neighbors=1, classifier__weights=distance; total time=   0.0s
[CV] END classifier__n_neighbors=2, classifier__weights=uniform; total time=   0.0s
[CV] END 

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Update your model pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(random_state=42))  # Using Random Forest
])

# Hyperparameter tuning using GridSearchCV for Random Forest
param_grid = {
    "classifier__n_estimators": [50, 100, 200],
    "classifier__max_depth": [None, 10, 20, 30],
    "classifier__min_samples_split": [2, 5, 10]
}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
best_model = grid_search.best_estimator_

# Evaluate on test set
test_accuracy = best_model.score(X_test, y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50; total time=   0.3s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50; total time=   0.5s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50; total time=   0.4s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50; total time=   0.5s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=50; total time=   0.5s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.7s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=100; total time=   0.9s
[CV] END classifier__max_depth=None, classifier__min_samples_split=2, classifier__n_estimators=100; total time

In [12]:
# Load the new data file (replace 'new_data.csv' with the actual file path)
new_data = pd.read_csv('C:\\Users\\MASSIVE\\Downloads\\datathon\\test.csv')

# Check the new data to confirm everything is in order
print(new_data.head())


   Patient_ID  Age Marital_Status  Year of Operation  Positive_Axillary_Nodes  \
0        1501   62         Single               1966                        4   
1        1502   33        Married               1960                        4   
2        1503   52        Married               1963                        7   
3        1504   56        Married               1968                        7   
4        1505   70        Married               1968                        1   

   Tumor_Size Radiation_Therapy Chemotherapy Hormone_Therapy  
0         1.4               Yes           No              No  
1         3.8               Yes           No              No  
2         2.1               Yes           No              No  
3         1.6               Yes           No              No  
4         4.1               Yes           No             Yes  


In [14]:
# Map categorical values in the new data
new_data['Marital_Status'] = new_data['Marital_Status'].map({'Married': 1, 'Single': 0})
new_data['Radiation_Therapy'] = new_data['Radiation_Therapy'].map({'Yes': 1, 'No': 0})
new_data['Chemotherapy'] = new_data['Chemotherapy'].map({'Yes': 1, 'No': 0})
new_data['Hormone_Therapy'] = new_data['Hormone_Therapy'].map({'Yes': 1, 'No': 0})

# Verify the mappings
print(new_data.head())


   Patient_ID  Age  Marital_Status  Year of Operation  \
0        1501   62               0               1966   
1        1502   33               1               1960   
2        1503   52               1               1963   
3        1504   56               1               1968   
4        1505   70               1               1968   

   Positive_Axillary_Nodes  Tumor_Size  Radiation_Therapy  Chemotherapy  \
0                        4         1.4                  1             0   
1                        4         3.8                  1             0   
2                        7         2.1                  1             0   
3                        7         1.6                  1             0   
4                        1         4.1                  1             0   

   Hormone_Therapy  Age_to_Year_Ratio  Tumor_Size_to_Positive_Nodes_Ratio  
0                0           0.031536                              0.2800  
1                0           0.016837                 

In [18]:
# Apply feature engineering to new data
new_data['Age_to_Year_Ratio'] = new_data['Age'] / new_data['Year of Operation']
new_data['Tumor_Size_to_Positive_Nodes_Ratio'] = new_data['Tumor_Size'] / (new_data['Positive_Axillary_Nodes'] + 1)  # Avoid division by zero
new_data['Treatment_Interaction'] = new_data['Radiation_Therapy'] * new_data['Chemotherapy']
new_data['Age_Times_Tumor_Size'] = new_data['Age'] * new_data['Tumor_Size']
new_data['Years_Since_Operation'] = new_data['Year of Operation'] - new_data['Age']
new_data['Positive_Nodes_Squared'] = new_data['Positive_Axillary_Nodes'] ** 2

# Check the new features
new_data.head()


Unnamed: 0,Patient_ID,Age,Marital_Status,Year of Operation,Positive_Axillary_Nodes,Tumor_Size,Radiation_Therapy,Chemotherapy,Hormone_Therapy,Age_to_Year_Ratio,Tumor_Size_to_Positive_Nodes_Ratio,Treatment_Interaction,Age_Times_Tumor_Size,Years_Since_Operation,Positive_Nodes_Squared
0,1501,62,0,1966,4,1.4,1,0,0,0.031536,0.28,0,86.8,1904,16
1,1502,33,1,1960,4,3.8,1,0,0,0.016837,0.76,0,125.4,1927,16
2,1503,52,1,1963,7,2.1,1,0,0,0.02649,0.2625,0,109.2,1911,49
3,1504,56,1,1968,7,1.6,1,0,0,0.028455,0.2,0,89.6,1912,49
4,1505,70,1,1968,1,4.1,1,0,1,0.035569,2.05,0,287.0,1898,1


In [16]:
X.head()


Unnamed: 0,Age,Marital_Status,Year of Operation,Positive_Axillary_Nodes,Tumor_Size,Radiation_Therapy,Chemotherapy,Hormone_Therapy,Age Marital_Status,Age Year of Operation,...,Positive_Axillary_Nodes Radiation_Therapy,Positive_Axillary_Nodes Chemotherapy,Positive_Axillary_Nodes Hormone_Therapy,Tumor_Size Radiation_Therapy,Tumor_Size Chemotherapy,Tumor_Size Hormone_Therapy,Radiation_Therapy Chemotherapy,Radiation_Therapy Hormone_Therapy,Chemotherapy Hormone_Therapy,Age_Category
0,77.0,1.0,1962.0,5.0,3.0,0.0,1.0,0.0,77.0,151074.0,...,0.0,5.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,Old
1,36.0,1.0,1964.0,2.0,1.9,1.0,0.0,0.0,36.0,70704.0,...,2.0,0.0,0.0,1.9,0.0,0.0,0.0,0.0,0.0,Middle_Aged
2,47.0,1.0,1960.0,5.0,2.0,0.0,0.0,0.0,47.0,92120.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Middle_Aged
3,54.0,1.0,1965.0,0.0,1.4,0.0,0.0,0.0,54.0,106110.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Middle_Aged
4,35.0,0.0,1968.0,5.0,4.1,1.0,1.0,1.0,0.0,68880.0,...,5.0,5.0,5.0,4.1,4.1,4.1,1.0,1.0,1.0,Middle_Aged
