In [None]:
#RF
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.plotting import add_at_risk_counts
from lifelines.statistics import logrank_test
import matplotlib.pyplot as plt
from sklearn.metrics import brier_score_loss
from lifelines.utils import concordance_index

# Function to load data
def load_data(train_path=None, test_path=None):
    """Load the training and testing tables from CSV.

    Args:
        train_path: Source of the training CSV (anything ``pd.read_csv``
            accepts). ``None`` selects the notebook's original location.
        test_path: Same, for the testing CSV.

    Returns:
        ``(train_data, test_data)`` as two DataFrames.
    """
    # Original hard-coded location; kept as the default so that a bare
    # ``load_data()`` call reads exactly the same files as before.
    default_dir = "D:/Apple/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/combination_survival/"
    if train_path is None:
        train_path = default_dir + "Total_GBM+LGG_t1+t2+t2Gd+flair_s_ICC_LASSO_survival_train.csv"
    if test_path is None:
        test_path = default_dir + "Total_GBM+LGG_t1+t2+t2Gd+flair_s_ICC_LASSO_survival_test.csv"
    return pd.read_csv(train_path), pd.read_csv(test_path)
# Define `load_data` function to load training and testing datasets from specified file paths.
 
# Function to prepare data
# def prepare_data(train_data, test_data):
#     X_train = train_data.drop(columns=['index','age_at_index','gender'])
#     y_train = train_data['OS']
#     X_test = test_data.drop(columns=['index','age_at_index','gender'])
#     return X_train, y_train, X_test
#Function to prepare data
def prepare_data(train_data, test_data):
    """Separate model inputs from the outcome and bookkeeping columns.

    Args:
        train_data: Training DataFrame containing at least the columns
            ``OS.time``, ``OS``, ``index``, ``age_at_index`` and ``gender``.
        test_data: Testing DataFrame with the same columns.

    Returns:
        ``(X_train, y_train, X_test)``: both frames with those five columns
        removed, plus the ``OS`` column of the training frame as the target.
        The input frames are left unmodified.
    """
    # Everything that is not listed here is treated as a model feature.
    excluded = ['OS.time', 'OS', 'index', 'age_at_index', 'gender']
    features_train = train_data.drop(columns=excluded)
    target_train = train_data['OS']
    features_test = test_data.drop(columns=excluded)
    return features_train, target_train, features_test
# Define `prepare_data` function to extract features and labels from training and testing datasets:
# - Drop irrelevant columns (e.g., `OS.time`, `index`, etc.).
# - Return training features, training labels, and testing features.

# Function to train Random Forest classifier
def train_classifier(X_train, y_train):
    """Fit a random forest on the training features and labels.

    Args:
        X_train: Training feature table.
        y_train: Training labels.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    # 300 trees; the fixed seed makes repeated runs build the same forest.
    forest_settings = {'n_estimators': 300, 'random_state': 42}
    forest = RandomForestClassifier(**forest_settings)
    forest.fit(X_train, y_train)
    return forest
# Define `train_classifier` function to train a Random Forest classifier:
# - `n_estimators=300` specifies the number of trees in the forest.
# - Return the trained model.

# Function to predict and assign survival groups
def assign_survival_groups(model, train_data, test_data):
    """Label every row as 'Short Survival' or 'Long Survival'.

    The model scores the stacked train+test rows. Rows whose second-column
    ``predict_proba`` value is at or above the median of all those scores
    are labelled 'Short Survival', the remaining rows 'Long Survival'.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        train_data: Training DataFrame (modified in place).
        test_data: Testing DataFrame (modified in place).

    Returns:
        The same ``train_data`` and ``test_data`` objects, each with a
        ``Survival_Group`` column written to it.
    """
    # NOTE(review): the cutoff is the median over train AND test scores, so
    # the test labels depend on the test set's own score distribution.
    n_train = len(train_data)
    pooled = pd.concat([train_data, test_data], axis=0)
    pooled_features = pooled.drop(columns=['OS.time', 'OS', 'index', 'age_at_index', 'gender'])

    scores = model.predict_proba(pooled_features)[:, 1]
    cutoff = np.median(scores)

    # Vectorised labelling; ``tolist()`` yields plain Python strings.
    pooled['Survival_Group'] = np.where(
        scores >= cutoff, 'Short Survival', 'Long Survival'
    ).tolist()

    # The first n_train pooled rows are the training rows, the rest the
    # testing rows; assignment aligns on each frame's own index.
    group_col = pooled['Survival_Group']
    train_data['Survival_Group'] = group_col.iloc[:n_train]
    test_data['Survival_Group'] = group_col.iloc[n_train:]

    return train_data, test_data
# Define `assign_survival_groups` function to assign survival groups based on predicted probabilities:
# - Combine training and testing datasets.
# - Predict survival probabilities using the trained model.
# - Assign "Short Survival" or "Long Survival" based on the median probability.
# - Split the combined data back into training and testing datasets.

# Function to perform Kaplan-Meier analysis and plot results
def perform_km_analysis(test_data):
    """Compare the predicted survival groups on the test set and plot them.

    Fits one Kaplan-Meier curve per ``Survival_Group`` value, fits a Cox
    model with the group as its only covariate to read the hazard ratio and
    its 95% interval from the ``group[T.Short]`` row, runs a log-rank test
    between the groups, and shows the annotated figure.

    Args:
        test_data: DataFrame holding ``OS.time``, ``OS`` and
            ``Survival_Group`` columns.

    Returns:
        The result object produced by ``logrank_test``.

    Note:
        The global ``plt.rcParams`` font family and size are changed as a
        side effect and are not restored.
    """
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 34

    # Durations and event flags of each predicted group.
    group_of = test_data['Survival_Group']
    long_rows = test_data[group_of == 'Long Survival']
    short_rows = test_data[group_of == 'Short Survival']
    long_time, long_event = long_rows['OS.time'], long_rows['OS']
    short_time, short_event = short_rows['OS.time'], short_rows['OS']

    # Cox model on the stacked groups with the group label as sole covariate.
    cox_frame = pd.DataFrame({
        'time': np.concatenate([long_time, short_time]),
        'event': np.concatenate([long_event, short_event]),
        'group': ['Long'] * len(long_time) + ['Short'] * len(short_time),
    })
    cox_model = CoxPHFitter()
    cox_model.fit(cox_frame, 'time', event_col='event', formula='group')

    # Hazard ratio and 95% interval come from the 'group[T.Short]' row.
    short_row = cox_model.summary.loc['group[T.Short]']
    hr = short_row['exp(coef)']
    ci_lower = short_row['exp(coef) lower 95%']
    ci_upper = short_row['exp(coef) upper 95%']

    # Kaplan-Meier curves, one fitter per group, drawn on a shared axis.
    _fig, ax = plt.subplots(figsize=(12, 10))
    kmf_long = KaplanMeierFitter()
    kmf_long.fit(long_time, long_event, label='Long Survival')
    kmf_long.plot(ax=ax)
    kmf_short = KaplanMeierFitter()
    kmf_short.fit(short_time, short_event, label='Short Survival')
    kmf_short.plot(ax=ax)

    # Major grid (dashed) plus minor grid (dotted).
    ax.grid(True, linestyle='--', linewidth=0.5)
    ax.minorticks_on()
    ax.grid(which='minor', linestyle=':', linewidth=0.5)

    # At-risk table placed below the axis.
    add_at_risk_counts(kmf_long, kmf_short, ax=ax, labels=['Long Survival', 'Short Survival'], ypos=-0.5)

    results = logrank_test(long_time, short_time, event_observed_A=long_event, event_observed_B=short_event)

    plt.title(f'p-value={results.p_value:.3f}, HR={hr:.2f} [95% CI: {ci_lower:.2f}-{ci_upper:.2f}]')
    ax.set_ylabel('Survival Probability')
    ax.set_xlabel('Time (days)')  # Label the time axis
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)  # Reserve space at the bottom of the figure
    plt.show()
    return results
# Define `perform_km_analysis` function to perform Kaplan-Meier survival analysis and plot results:
# - Fit Kaplan-Meier curves for "Long Survival" and "Short Survival" groups.
# - Fit a Cox Proportional Hazards model to calculate hazard ratio (HR) and confidence intervals.
# - Plot Kaplan-Meier curves and add a risk table.


# Function to calculate c-index and Brier score
def calculate_metrics(model, test_data):
    """Print and return the c-index and Brier score on the test set.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        test_data: DataFrame with ``OS.time``, ``OS``, ``index``,
            ``age_at_index``, ``gender`` and ``Survival_Group`` columns
            alongside the feature columns.

    Returns:
        ``(c_index, brier_score)``.
    """
    non_feature_cols = ['OS.time', 'OS', 'index', 'age_at_index', 'gender', 'Survival_Group']
    features = test_data.drop(columns=non_feature_cols)
    durations = test_data['OS.time']
    events = test_data['OS']

    # Second predict_proba column (assumes OS is coded 0/1 with 1 = event).
    event_prob = model.predict_proba(features)[:, 1]

    # The score is negated so that a higher event probability ranks as a
    # shorter survival time in the concordance computation.
    c_index = concordance_index(durations, -event_prob, events)
    brier_score = brier_score_loss(events, event_prob)

    for metric_name, metric_value in (("c-index", c_index), ("Brier score", brier_score)):
        print(f"{metric_name}: {metric_value:.3f}")
    return c_index, brier_score
# Define `calculate_metrics` function to evaluate model performance:
# - c-index: Measures the concordance between predicted and actual survival times.
# - Brier score: Measures the accuracy of predicted probabilities.

# Main execution
try:
    # Pipeline: load -> split features/labels -> fit -> group -> KM plot -> metrics.
    train_data, test_data = load_data()
    X_train, y_train, X_test = prepare_data(train_data, test_data)
    model = train_classifier(X_train, y_train)
    train_data, test_data = assign_survival_groups(model, train_data, test_data)
    results = perform_km_analysis(test_data)
    c_index, brier_score = calculate_metrics(model, test_data)
except Exception as e:
    # Top-level boundary: keep the cell from aborting, but also emit the
    # traceback so the failing step can be located, not just its message.
    import traceback
    print(f"An error occurred: {e}")
    traceback.print_exc()
# Main program execution:
# - Catch and print any errors that occur during execution.

In [None]:
#XGBOOST
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.plotting import add_at_risk_counts
from lifelines.statistics import logrank_test
import matplotlib.pyplot as plt
from sklearn.metrics import brier_score_loss
from lifelines.utils import concordance_index

# Function to load data
def load_data(train_path=None, test_path=None):
    """Load the training and testing tables from CSV.

    Args:
        train_path: Source of the training CSV (anything ``pd.read_csv``
            accepts). ``None`` selects the notebook's original location.
        test_path: Same, for the testing CSV.

    Returns:
        ``(train_data, test_data)`` as two DataFrames.
    """
    # Original hard-coded location; kept as the default so that a bare
    # ``load_data()`` call reads exactly the same files as before.
    default_dir = "D:/Apple/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/combination_survival/"
    if train_path is None:
        train_path = default_dir + "Total_GBM+LGG_t1+t2+t2Gd+flair_s_ICC_LASSO_survival_train.csv"
    if test_path is None:
        test_path = default_dir + "Total_GBM+LGG_t1+t2+t2Gd+flair_s_ICC_LASSO_survival_test.csv"
    return pd.read_csv(train_path), pd.read_csv(test_path)
# Define `load_data` function to load training and testing datasets from specified file paths.

# Function to prepare data
def prepare_data(train_data, test_data):
    """Separate model inputs from the outcome and bookkeeping columns.

    Args:
        train_data: Training DataFrame containing at least the columns
            ``OS.time``, ``OS``, ``index``, ``age_at_index`` and ``gender``.
        test_data: Testing DataFrame with the same columns.

    Returns:
        ``(X_train, y_train, X_test)``: both frames with those five columns
        removed, plus the ``OS`` column of the training frame as the target.
        The input frames are left unmodified.
    """
    # Everything that is not listed here is treated as a model feature.
    excluded = ['OS.time', 'OS', 'index', 'age_at_index', 'gender']
    features_train = train_data.drop(columns=excluded)
    target_train = train_data['OS']
    features_test = test_data.drop(columns=excluded)
    return features_train, target_train, features_test
# Define `prepare_data` function to extract features and labels from training and testing datasets:
# - Drop irrelevant columns (e.g., `OS.time`, `index`, etc.).
# - Return training features, training labels, and testing features.

# Define a function to train a classifier using the XGBoost algorithm.
def train_classifier(X_train, y_train):
    """Fit an XGBoost classifier on the training features and labels.

    Args:
        X_train: Training feature table.
        y_train: Training labels.

    Returns:
        The fitted ``XGBClassifier``.
    """
    # All hyperparameters are library defaults except the label-encoder flag.
    # NOTE(review): `use_label_encoder` is deprecated in newer XGBoost
    # releases -- confirm it is still accepted by the installed version.
    booster = XGBClassifier(use_label_encoder=False)
    booster.fit(X_train, y_train)
    return booster

# Function to predict and assign survival groups, the same as the previous one (RF)
def assign_survival_groups(model, train_data, test_data):
    """Label every row as 'Short Survival' or 'Long Survival'.

    The model scores the stacked train+test rows. Rows whose second-column
    ``predict_proba`` value is at or above the median of all those scores
    are labelled 'Short Survival', the remaining rows 'Long Survival'.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        train_data: Training DataFrame (modified in place).
        test_data: Testing DataFrame (modified in place).

    Returns:
        The same ``train_data`` and ``test_data`` objects, each with a
        ``Survival_Group`` column written to it.
    """
    # NOTE(review): the cutoff is the median over train AND test scores, so
    # the test labels depend on the test set's own score distribution.
    n_train = len(train_data)
    pooled = pd.concat([train_data, test_data], axis=0)
    pooled_features = pooled.drop(columns=['OS.time', 'OS', 'index', 'age_at_index', 'gender'])

    scores = model.predict_proba(pooled_features)[:, 1]
    cutoff = np.median(scores)

    # Vectorised labelling; ``tolist()`` yields plain Python strings.
    pooled['Survival_Group'] = np.where(
        scores >= cutoff, 'Short Survival', 'Long Survival'
    ).tolist()

    # The first n_train pooled rows are the training rows, the rest the
    # testing rows; assignment aligns on each frame's own index.
    group_col = pooled['Survival_Group']
    train_data['Survival_Group'] = group_col.iloc[:n_train]
    test_data['Survival_Group'] = group_col.iloc[n_train:]

    return train_data, test_data

# Function to perform Kaplan-Meier analysis and plot results, the same as the previous one(RF)
def perform_km_analysis(test_data):
    """Compare the predicted survival groups on the test set and plot them.

    Fits one Kaplan-Meier curve per ``Survival_Group`` value, fits a Cox
    model with the group as its only covariate to read the hazard ratio and
    its 95% interval from the ``group[T.Short]`` row, runs a log-rank test
    between the groups, and shows the annotated figure.

    Args:
        test_data: DataFrame holding ``OS.time``, ``OS`` and
            ``Survival_Group`` columns.

    Returns:
        The result object produced by ``logrank_test``.

    Note:
        The global ``plt.rcParams`` font family and size are changed as a
        side effect and are not restored.
    """
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 34

    # Durations and event flags of each predicted group.
    group_of = test_data['Survival_Group']
    long_rows = test_data[group_of == 'Long Survival']
    short_rows = test_data[group_of == 'Short Survival']
    long_time, long_event = long_rows['OS.time'], long_rows['OS']
    short_time, short_event = short_rows['OS.time'], short_rows['OS']

    # Cox model on the stacked groups with the group label as sole covariate.
    cox_frame = pd.DataFrame({
        'time': np.concatenate([long_time, short_time]),
        'event': np.concatenate([long_event, short_event]),
        'group': ['Long'] * len(long_time) + ['Short'] * len(short_time),
    })
    cox_model = CoxPHFitter()
    cox_model.fit(cox_frame, 'time', event_col='event', formula='group')

    # Hazard ratio and 95% interval come from the 'group[T.Short]' row.
    short_row = cox_model.summary.loc['group[T.Short]']
    hr = short_row['exp(coef)']
    ci_lower = short_row['exp(coef) lower 95%']
    ci_upper = short_row['exp(coef) upper 95%']

    # Kaplan-Meier curves, one fitter per group, drawn on a shared axis.
    _fig, ax = plt.subplots(figsize=(12, 10))
    kmf_long = KaplanMeierFitter()
    kmf_long.fit(long_time, long_event, label='Long Survival')
    kmf_long.plot(ax=ax)
    kmf_short = KaplanMeierFitter()
    kmf_short.fit(short_time, short_event, label='Short Survival')
    kmf_short.plot(ax=ax)

    # Major grid (dashed) plus minor grid (dotted).
    ax.grid(True, linestyle='--', linewidth=0.5)
    ax.minorticks_on()
    ax.grid(which='minor', linestyle=':', linewidth=0.5)

    # At-risk table placed below the axis.
    add_at_risk_counts(kmf_long, kmf_short, ax=ax, labels=['Long Survival', 'Short Survival'], ypos=-0.5)

    results = logrank_test(long_time, short_time, event_observed_A=long_event, event_observed_B=short_event)

    plt.title(f'p-value={results.p_value:.3f}, HR={hr:.2f} [95% CI: {ci_lower:.2f}-{ci_upper:.2f}]')
    ax.set_ylabel('Survival Probability')
    ax.set_xlabel('Time (days)')  # Label the time axis
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)  # Reserve space at the bottom of the figure
    plt.show()
    return results
def calculate_metrics(model, test_data):
    """Print and return the c-index and Brier score on the test set.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        test_data: DataFrame with ``OS.time``, ``OS``, ``index``,
            ``age_at_index``, ``gender`` and ``Survival_Group`` columns
            alongside the feature columns.

    Returns:
        ``(c_index, brier_score)``.
    """
    non_feature_cols = ['OS.time', 'OS', 'index', 'age_at_index', 'gender', 'Survival_Group']
    features = test_data.drop(columns=non_feature_cols)
    durations = test_data['OS.time']
    events = test_data['OS']

    # Second predict_proba column (assumes OS is coded 0/1 with 1 = event).
    event_prob = model.predict_proba(features)[:, 1]

    # The score is negated so that a higher event probability ranks as a
    # shorter survival time in the concordance computation.
    c_index = concordance_index(durations, -event_prob, events)
    brier_score = brier_score_loss(events, event_prob)

    for metric_name, metric_value in (("c-index", c_index), ("Brier score", brier_score)):
        print(f"{metric_name}: {metric_value:.3f}")
    return c_index, brier_score

# Main execution
try:
    # Pipeline: load -> split features/labels -> fit -> group -> KM plot -> metrics.
    train_data, test_data = load_data()
    X_train, y_train, X_test = prepare_data(train_data, test_data)
    model = train_classifier(X_train, y_train)
    train_data, test_data = assign_survival_groups(model, train_data, test_data)
    results = perform_km_analysis(test_data)
    # Calculate c-index and Brier score
    c_index, brier_score = calculate_metrics(model, test_data)
except Exception as e:
    # Top-level boundary: keep the cell from aborting, but also emit the
    # traceback so the failing step can be located, not just its message.
    import traceback
    print(f"An error occurred: {e}")
    traceback.print_exc()


In [None]:
#LR
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.plotting import add_at_risk_counts
from lifelines.statistics import logrank_test
import matplotlib.pyplot as plt
from sklearn.metrics import brier_score_loss
from lifelines.utils import concordance_index

# Function to load data
def load_data(train_path=None, test_path=None):
    """Load the training and testing tables from CSV.

    Args:
        train_path: Source of the training CSV (anything ``pd.read_csv``
            accepts). ``None`` selects the notebook's original location.
        test_path: Same, for the testing CSV.

    Returns:
        ``(train_data, test_data)`` as two DataFrames.
    """
    # Original hard-coded location; kept as the default so that a bare
    # ``load_data()`` call reads exactly the same files as before.
    default_dir = "D:/Apple/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/combination_survival/"
    if train_path is None:
        train_path = default_dir + "Total_GBM+LGG_t1+t2+t2Gd+flair_s_ICC_LASSO_survival_train.csv"
    if test_path is None:
        test_path = default_dir + "Total_GBM+LGG_t1+t2+t2Gd+flair_s_ICC_LASSO_survival_test.csv"
    return pd.read_csv(train_path), pd.read_csv(test_path)
# Define `load_data` function to load training and testing datasets from specified file paths.

# Function to prepare data
def prepare_data(train_data, test_data):
    """Separate model inputs from the outcome and bookkeeping columns.

    Args:
        train_data: Training DataFrame containing at least the columns
            ``OS.time``, ``OS``, ``index``, ``age_at_index`` and ``gender``.
        test_data: Testing DataFrame with the same columns.

    Returns:
        ``(X_train, y_train, X_test)``: both frames with those five columns
        removed, plus the ``OS`` column of the training frame as the target.
        The input frames are left unmodified.
    """
    # Everything that is not listed here is treated as a model feature.
    excluded = ['OS.time', 'OS', 'index', 'age_at_index', 'gender']
    features_train = train_data.drop(columns=excluded)
    target_train = train_data['OS']
    features_test = test_data.drop(columns=excluded)
    return features_train, target_train, features_test
# Define `prepare_data` function to extract features and labels from training and testing datasets:
# - Drop irrelevant columns (e.g., `OS.time`, `index`, etc.).
# - Return training features, training labels, and testing features.


# def train_classifier(X_train, y_train):
#     model = LogisticRegression(penalty="l2", solver="liblinear", C=1.0, max_iter=100)
#     model.fit(X_train, y_train)
#     return model
#Define 'train_classifier' function to train a logistic regression classifier:
def train_classifier(X_train, y_train):
    """Fit a logistic regression through a 5-fold grid search.

    Args:
        X_train: Training feature table.
        y_train: Training labels.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    # The grid currently holds a single candidate (C=1, liblinear, L2), so
    # the search only cross-validates that one configuration.
    search_space = {'C': [1], 'solver': ['liblinear'], 'penalty': ['l2']}

    searcher = GridSearchCV(
        estimator=LogisticRegression(max_iter=100),  # cap on solver iterations
        param_grid=search_space,
        cv=5,                # 5-fold cross-validation
        scoring='accuracy',  # candidates are ranked by accuracy
        n_jobs=-1,           # use all available CPU cores
    )
    searcher.fit(X_train, y_train)

    print("Best parameters found: ", searcher.best_params_)

    return searcher.best_estimator_

# Function to predict and assign survival groups, the same as the previous one (RF)
def assign_survival_groups(model, train_data, test_data):
    """Label every row as 'Short Survival' or 'Long Survival'.

    The model scores the stacked train+test rows. Rows whose second-column
    ``predict_proba`` value is at or above the median of all those scores
    are labelled 'Short Survival', the remaining rows 'Long Survival'.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        train_data: Training DataFrame (modified in place).
        test_data: Testing DataFrame (modified in place).

    Returns:
        The same ``train_data`` and ``test_data`` objects, each with a
        ``Survival_Group`` column written to it.
    """
    # NOTE(review): the cutoff is the median over train AND test scores, so
    # the test labels depend on the test set's own score distribution.
    n_train = len(train_data)
    pooled = pd.concat([train_data, test_data], axis=0)
    pooled_features = pooled.drop(columns=['OS.time', 'OS', 'index', 'age_at_index', 'gender'])

    scores = model.predict_proba(pooled_features)[:, 1]
    cutoff = np.median(scores)

    # Vectorised labelling; ``tolist()`` yields plain Python strings.
    pooled['Survival_Group'] = np.where(
        scores >= cutoff, 'Short Survival', 'Long Survival'
    ).tolist()

    # The first n_train pooled rows are the training rows, the rest the
    # testing rows; assignment aligns on each frame's own index.
    group_col = pooled['Survival_Group']
    train_data['Survival_Group'] = group_col.iloc[:n_train]
    test_data['Survival_Group'] = group_col.iloc[n_train:]

    return train_data, test_data

# Function to perform Kaplan-Meier analysis and plot results,the same as the previous one(RF)
def perform_km_analysis(test_data):
    """Compare the predicted survival groups on the test set and plot them.

    Fits one Kaplan-Meier curve per ``Survival_Group`` value, fits a Cox
    model with the group as its only covariate to read the hazard ratio and
    its 95% interval from the ``group[T.Short]`` row, runs a log-rank test
    between the groups, shows the annotated figure, and prints the log-rank
    and Cox p-values.

    Args:
        test_data: DataFrame holding ``OS.time``, ``OS`` and
            ``Survival_Group`` columns.

    Returns:
        The result object produced by ``logrank_test``.

    Note:
        The global ``plt.rcParams`` font family and size are changed as a
        side effect and are not restored.
    """
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 34

    # Durations and event flags of each predicted group.
    group_of = test_data['Survival_Group']
    long_rows = test_data[group_of == 'Long Survival']
    short_rows = test_data[group_of == 'Short Survival']
    T_long, E_long = long_rows['OS.time'], long_rows['OS']
    T_short, E_short = short_rows['OS.time'], short_rows['OS']

    # Cox model on the stacked groups with the group label as sole covariate.
    df = pd.DataFrame({
        'time': np.concatenate([T_long, T_short]),
        'event': np.concatenate([E_long, E_short]),
        'group': ['Long'] * len(T_long) + ['Short'] * len(T_short),
    })
    cph = CoxPHFitter()
    cph.fit(df, 'time', event_col='event', formula='group')

    # Hazard ratio and 95% interval come from the 'group[T.Short]' row.
    short_row = cph.summary.loc['group[T.Short]']
    hr = short_row['exp(coef)']
    ci_lower = short_row['exp(coef) lower 95%']
    ci_upper = short_row['exp(coef) upper 95%']

    # Kaplan-Meier curves, one fitter per group, drawn on a shared axis.
    _fig, ax = plt.subplots(figsize=(12, 10))
    kmf_long = KaplanMeierFitter()
    kmf_long.fit(T_long, E_long, label='Long Survival').plot(ax=ax)
    kmf_short = KaplanMeierFitter()
    kmf_short.fit(T_short, E_short, label='Short Survival').plot(ax=ax)

    # Major grid (dashed) plus minor grid (dotted).
    ax.grid(True, linestyle='--', linewidth=0.5)
    ax.minorticks_on()
    ax.grid(which='minor', linestyle=':', linewidth=0.5)

    # At-risk table placed below the axis.
    add_at_risk_counts(kmf_long, kmf_short, ax=ax, labels=['Long Survival', 'Short Survival'], ypos=-0.5)

    results = logrank_test(T_long, T_short, event_observed_A=E_long, event_observed_B=E_short)

    plt.title(f'p-value={results.p_value:.3f}, HR={hr:.2f} [95% CI: {ci_lower:.2f}-{ci_upper:.2f}]')
    ax.set_ylabel('Survival Probability')
    ax.set_xlabel('Time (days)')  # Label the time axis
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)  # Reserve space at the bottom of the figure
    plt.show()

    # Report the unrounded p-values (the title only shows three decimals).
    print("Log-rank test p-value:", results.p_value)
    print("Cox model p-values:")
    print(cph.summary['p'])

    # Single exit point; the original had a second, unreachable return here.
    return results
def calculate_metrics(model, test_data):
    """Print and return the c-index and Brier score on the test set.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        test_data: DataFrame with ``OS.time``, ``OS``, ``index``,
            ``age_at_index``, ``gender`` and ``Survival_Group`` columns
            alongside the feature columns.

    Returns:
        ``(c_index, brier_score)``.
    """
    non_feature_cols = ['OS.time', 'OS', 'index', 'age_at_index', 'gender', 'Survival_Group']
    features = test_data.drop(columns=non_feature_cols)
    durations = test_data['OS.time']
    events = test_data['OS']

    # Second predict_proba column (assumes OS is coded 0/1 with 1 = event).
    event_prob = model.predict_proba(features)[:, 1]

    # The score is negated so that a higher event probability ranks as a
    # shorter survival time in the concordance computation.
    c_index = concordance_index(durations, -event_prob, events)
    brier_score = brier_score_loss(events, event_prob)

    for metric_name, metric_value in (("c-index", c_index), ("Brier score", brier_score)):
        print(f"{metric_name}: {metric_value:.3f}")
    return c_index, brier_score

# Main execution,also the same as the previous one(RF)
try:
    # Pipeline: load -> split features/labels -> fit -> group -> KM plot -> metrics.
    train_data, test_data = load_data()
    X_train, y_train, X_test = prepare_data(train_data, test_data)
    model = train_classifier(X_train, y_train)
    train_data, test_data = assign_survival_groups(model, train_data, test_data)
    results = perform_km_analysis(test_data)
    # Calculate c-index and Brier score
    c_index, brier_score = calculate_metrics(model, test_data)
except Exception as e:
    # Top-level boundary: keep the cell from aborting, but also emit the
    # traceback so the failing step can be located, not just its message.
    import traceback
    print(f"An error occurred: {e}")
    traceback.print_exc()


In [None]:
#SVM
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from lifelines import KaplanMeierFitter, CoxPHFitter
from lifelines.plotting import add_at_risk_counts
from lifelines.statistics import logrank_test
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import brier_score_loss
from lifelines.utils import concordance_index

# Function to load data
def load_data(train_path=None, test_path=None):
    """Load the training and testing tables from CSV.

    Args:
        train_path: Source of the training CSV (anything ``pd.read_csv``
            accepts). ``None`` selects the notebook's original location.
        test_path: Same, for the testing CSV.

    Returns:
        ``(train_data, test_data)`` as two DataFrames.
    """
    # Original hard-coded location; kept as the default so that a bare
    # ``load_data()`` call reads exactly the same files as before.
    default_dir = "D:/Apple/survival analysis/survival analysis/APPLE/t1+t1Gd+t2+flair/combination_survival/"
    if train_path is None:
        train_path = default_dir + "Total_GBM+LGG_t1+t2+t2Gd+flair_s_ICC_LASSO_survival_train.csv"
    if test_path is None:
        test_path = default_dir + "Total_GBM+LGG_t1+t2+t2Gd+flair_s_ICC_LASSO_survival_test.csv"
    return pd.read_csv(train_path), pd.read_csv(test_path)
# Define `load_data` function to load training and testing datasets from specified file paths.

# Function to prepare data
def prepare_data(train_data, test_data):
    """Separate model inputs from the outcome and bookkeeping columns.

    Args:
        train_data: Training DataFrame containing at least the columns
            ``OS.time``, ``OS``, ``index``, ``age_at_index`` and ``gender``.
        test_data: Testing DataFrame with the same columns.

    Returns:
        ``(X_train, y_train, X_test)``: both frames with those five columns
        removed, plus the ``OS`` column of the training frame as the target.
        The input frames are left unmodified.
    """
    # Everything that is not listed here is treated as a model feature.
    excluded = ['OS.time', 'OS', 'index', 'age_at_index', 'gender']
    features_train = train_data.drop(columns=excluded)
    target_train = train_data['OS']
    features_test = test_data.drop(columns=excluded)
    return features_train, target_train, features_test
# Define `prepare_data` function to extract features and labels from training and testing datasets:
# - Drop irrelevant columns (e.g., `OS.time`, `index`, etc.).
# - Return training features, training labels, and testing features.



# def train_classifier(X_train, y_train):
#     model = SVC(kernel='rbf', C=1.0, probability=True)
#     model.fit(X_train, y_train)
#     return 
# Function to train SVM classifier with GridSearchCV for hyperparameter tuning

def train_classifier(X_train, y_train):
    """Fit an SVM classifier through a 5-fold grid search.

    Args:
        X_train: Training feature table.
        y_train: Training labels.

    Returns:
        ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    # The grid currently holds a single candidate (C=1, RBF kernel).
    # `probability=True` is what makes `predict_proba` available on the
    # returned estimator.
    search_space = {'C': [1], 'kernel': ['rbf'], 'probability': [True]}

    searcher = GridSearchCV(
        estimator=SVC(),
        param_grid=search_space,
        cv=5,                # 5-fold cross-validation
        scoring='accuracy',  # candidates are ranked by accuracy
        n_jobs=-1,           # use all available CPU cores
    )
    searcher.fit(X_train, y_train)

    print(f"Best parameters found: {searcher.best_params_}")

    return searcher.best_estimator_


# Function to predict and assign survival groups. This is the same as the previous one (RF)
def assign_survival_groups(model, train_data, test_data):
    """Label every row as 'Short Survival' or 'Long Survival'.

    The model scores the stacked train+test rows. Rows whose second-column
    ``predict_proba`` value is at or above the median of all those scores
    are labelled 'Short Survival', the remaining rows 'Long Survival'.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        train_data: Training DataFrame (modified in place).
        test_data: Testing DataFrame (modified in place).

    Returns:
        The same ``train_data`` and ``test_data`` objects, each with a
        ``Survival_Group`` column written to it.
    """
    # NOTE(review): the cutoff is the median over train AND test scores, so
    # the test labels depend on the test set's own score distribution.
    n_train = len(train_data)
    pooled = pd.concat([train_data, test_data], axis=0)
    pooled_features = pooled.drop(columns=['OS.time', 'OS', 'index', 'age_at_index', 'gender'])

    scores = model.predict_proba(pooled_features)[:, 1]
    cutoff = np.median(scores)

    # Vectorised labelling; ``tolist()`` yields plain Python strings.
    pooled['Survival_Group'] = np.where(
        scores >= cutoff, 'Short Survival', 'Long Survival'
    ).tolist()

    # The first n_train pooled rows are the training rows, the rest the
    # testing rows; assignment aligns on each frame's own index.
    group_col = pooled['Survival_Group']
    train_data['Survival_Group'] = group_col.iloc[:n_train]
    test_data['Survival_Group'] = group_col.iloc[n_train:]

    return train_data, test_data

# Function to perform Kaplan-Meier analysis and plot results. This is the same as the previous one (RF)
def perform_km_analysis(test_data):
    """Compare the predicted survival groups on the test set and plot them.

    Fits one Kaplan-Meier curve per ``Survival_Group`` value, fits a Cox
    model with the group as its only covariate to read the hazard ratio and
    its 95% interval from the ``group[T.Short]`` row, runs a log-rank test
    between the groups, and shows the annotated figure.

    Args:
        test_data: DataFrame holding ``OS.time``, ``OS`` and
            ``Survival_Group`` columns.

    Returns:
        The result object produced by ``logrank_test``.

    Note:
        The global ``plt.rcParams`` font family and size are changed as a
        side effect and are not restored.
    """
    plt.rcParams["font.family"] = "Times New Roman"
    plt.rcParams["font.size"] = 34

    # Durations and event flags of each predicted group.
    group_of = test_data['Survival_Group']
    long_rows = test_data[group_of == 'Long Survival']
    short_rows = test_data[group_of == 'Short Survival']
    long_time, long_event = long_rows['OS.time'], long_rows['OS']
    short_time, short_event = short_rows['OS.time'], short_rows['OS']

    # Cox model on the stacked groups with the group label as sole covariate.
    cox_frame = pd.DataFrame({
        'time': np.concatenate([long_time, short_time]),
        'event': np.concatenate([long_event, short_event]),
        'group': ['Long'] * len(long_time) + ['Short'] * len(short_time),
    })
    cox_model = CoxPHFitter()
    cox_model.fit(cox_frame, 'time', event_col='event', formula='group')

    # Hazard ratio and 95% interval come from the 'group[T.Short]' row.
    short_row = cox_model.summary.loc['group[T.Short]']
    hr = short_row['exp(coef)']
    ci_lower = short_row['exp(coef) lower 95%']
    ci_upper = short_row['exp(coef) upper 95%']

    # Kaplan-Meier curves, one fitter per group, drawn on a shared axis.
    _fig, ax = plt.subplots(figsize=(12, 10))
    kmf_long = KaplanMeierFitter()
    kmf_long.fit(long_time, long_event, label='Long Survival')
    kmf_long.plot(ax=ax)
    kmf_short = KaplanMeierFitter()
    kmf_short.fit(short_time, short_event, label='Short Survival')
    kmf_short.plot(ax=ax)

    # Major grid (dashed) plus minor grid (dotted).
    ax.grid(True, linestyle='--', linewidth=0.5)
    ax.minorticks_on()
    ax.grid(which='minor', linestyle=':', linewidth=0.5)

    # At-risk table placed below the axis.
    add_at_risk_counts(kmf_long, kmf_short, ax=ax, labels=['Long Survival', 'Short Survival'], ypos=-0.5)

    results = logrank_test(long_time, short_time, event_observed_A=long_event, event_observed_B=short_event)

    plt.title(f'p-value={results.p_value:.3f}, HR={hr:.2f} [95% CI: {ci_lower:.2f}-{ci_upper:.2f}]')
    ax.set_ylabel('Survival Probability')
    ax.set_xlabel('Time (days)')  # Label the time axis
    plt.tight_layout()
    plt.subplots_adjust(bottom=0.2)  # Reserve space at the bottom of the figure
    plt.show()
    return results
def calculate_metrics(model, test_data):
    """Print and return the c-index and Brier score on the test set.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        test_data: DataFrame with ``OS.time``, ``OS``, ``index``,
            ``age_at_index``, ``gender`` and ``Survival_Group`` columns
            alongside the feature columns.

    Returns:
        ``(c_index, brier_score)``.
    """
    non_feature_cols = ['OS.time', 'OS', 'index', 'age_at_index', 'gender', 'Survival_Group']
    features = test_data.drop(columns=non_feature_cols)
    durations = test_data['OS.time']
    events = test_data['OS']

    # Second predict_proba column (assumes OS is coded 0/1 with 1 = event).
    event_prob = model.predict_proba(features)[:, 1]

    # The score is negated so that a higher event probability ranks as a
    # shorter survival time in the concordance computation.
    c_index = concordance_index(durations, -event_prob, events)
    brier_score = brier_score_loss(events, event_prob)

    for metric_name, metric_value in (("c-index", c_index), ("Brier score", brier_score)):
        print(f"{metric_name}: {metric_value:.3f}")
    return c_index, brier_score

# Main execution. This is the same as the previous one (RF)
try:
    # Pipeline: load -> split features/labels -> fit -> group -> KM plot -> metrics.
    train_data, test_data = load_data()
    X_train, y_train, X_test = prepare_data(train_data, test_data)
    model = train_classifier(X_train, y_train)
    train_data, test_data = assign_survival_groups(model, train_data, test_data)
    results = perform_km_analysis(test_data)
    # Calculate c-index and Brier score
    c_index, brier_score = calculate_metrics(model, test_data)
except Exception as e:
    # Top-level boundary: keep the cell from aborting, but also emit the
    # traceback so the failing step can be located, not just its message.
    import traceback
    print(f"An error occurred: {e}")
    traceback.print_exc()
