<a href="https://colab.research.google.com/github/EmaanBashir/PCR-AND-RFS-PREDICTION-ON-BREAST-CANCER-DATASET/blob/main/FSAnovaKfold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
def data_cleaning_for_regression(df):

    # dropping PCR  for Classification

    df = df.drop('pCR (outcome)',axis=1)

    # Replacing missing values

    df=df.replace(999, np.nan)
    imputer = SimpleImputer(strategy='median')

    df_imputed=imputer.fit_transform(df)

    df_imputed = pd.DataFrame(df_imputed, columns=df.columns)

    # Dealing with Categorical variables

    #  Age,ChemoGrade,Proliferation and TumorStage are ordinal categorical variables so we do not change them

    # ER, PGR, HER2, TrippleNegative, HistologyType and LNStatus are either binary or categorical variables with 2 different
    # values. We just convert HistologyType to binary format.

    df_imputed['HistologyType'] = df_imputed['HistologyType'].map({1: 0, 2: 1})

    return df_imputed

In [None]:
# Reading the Data and dropping 'ID'
df=pd.read_excel('TrainDataset2023.xls')
df=df.drop('ID',axis=1)

df_regression =  data_cleaning_for_regression(df)
print(df_regression.columns.tolist())

y = df_regression['RelapseFreeSurvival (outcome)']
X = df_regression.drop('RelapseFreeSurvival (outcome)', axis=1)

# Identify categorical and numerical columns
categorical_cols = ['Age', 'ChemoGrade', 'Proliferation', 'TumorStage', 'ER', 'PgR', 'HER2', 'TrippleNegative', 'HistologyType', 'LNStatus']
numerical_cols = [col for col in X.columns if col not in categorical_cols]

# Normalize the numerical features
scaler = StandardScaler()
X_normalized = X.copy()
# X_normalized[numerical_cols] = scaler.fit_transform(X[numerical_cols])

['RelapseFreeSurvival (outcome)', 'Age', 'ER', 'PgR', 'HER2', 'TrippleNegative', 'ChemoGrade', 'Proliferation', 'HistologyType', 'LNStatus', 'TumourStage', 'original_shape_Elongation', 'original_shape_Flatness', 'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength', 'original_shape_Maximum2DDiameterColumn', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterSlice', 'original_shape_Maximum3DDiameter', 'original_shape_MeshVolume', 'original_shape_MinorAxisLength', 'original_shape_Sphericity', 'original_shape_SurfaceArea', 'original_shape_SurfaceVolumeRatio', 'original_shape_VoxelVolume', 'original_firstorder_10Percentile', 'original_firstorder_90Percentile', 'original_firstorder_Energy', 'original_firstorder_Entropy', 'original_firstorder_InterquartileRange', 'original_firstorder_Kurtosis', 'original_firstorder_Maximum', 'original_firstorder_MeanAbsoluteDeviation', 'original_firstorder_Mean', 'original_firstorder_Median', 'original_firstorder_Minimum', '

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np

In [None]:
def select_features_with_anova(X, y, model, feature_range, n_folds):
    """
    Performs feature selection using ANOVA and cross-validation.

    Parameters:
    - X: Feature matrix.
    - y: Target variable.
    - model: Scikit-learn model for evaluation.
    - feature_range: Tuple specifying the range of feature counts to try (inclusive).
    - n_folds: Number of folds for cross-validation.

    Returns:
    - List of names of the selected features.
    """

    # Store the average cross-validation scores for each feature count
    cv_scores = []

    # Loop through the specified range of features
    for k in range(feature_range[0], feature_range[1] + 1):
        # Apply SelectKBest
        selector = SelectKBest(f_classif, k=k)
        X_new = selector.fit_transform(X, y)

        # Perform cross-validation
        scores = cross_val_score(model, X_new, y, cv=n_folds, scoring='r2')
        cv_scores.append(scores.mean())

    # Find the number of features for the best average CV score
    optimal_k = np.argmax(cv_scores) + feature_range[0]
    print(f"Optimal number of features: {optimal_k}")

    # Re-fit the selector with the optimal number of features
    final_selector = SelectKBest(f_classif, k=optimal_k).fit(X, y)

    # Get the boolean mask of the selected features
    selected_features_mask = final_selector.get_support()

    # Extract and return the feature names
    selected_features_names = X.columns[selected_features_mask]
    return selected_features_names.tolist()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

In [None]:
# Random Forest Classifier
rf_model = RandomForestRegressor(max_depth = 5, min_samples_split = 2, n_estimators = 100, criterion = 'absolute_error', random_state=0)  # Adjust parameters as needed
selected_features_rf = select_features_with_anova(X_normalized, y, rf_model, (10, 50), 3)
print("Selected feature names with Random Forest:", selected_features_rf)
print("Number of features selected:", len(selected_features_rf))


Optimal number of features: 42
Selected feature names with Random Forest: ['ER', 'HistologyType', 'original_shape_SurfaceVolumeRatio', 'original_firstorder_Minimum', 'original_glcm_Autocorrelation', 'original_glcm_Correlation', 'original_glcm_DifferenceVariance', 'original_glcm_JointAverage', 'original_glcm_MCC', 'original_glcm_SumAverage', 'original_gldm_DependenceEntropy', 'original_gldm_DependenceNonUniformityNormalized', 'original_gldm_DependenceVariance', 'original_gldm_HighGrayLevelEmphasis', 'original_gldm_LargeDependenceEmphasis', 'original_gldm_LargeDependenceHighGrayLevelEmphasis', 'original_gldm_LargeDependenceLowGrayLevelEmphasis', 'original_gldm_LowGrayLevelEmphasis', 'original_gldm_SmallDependenceHighGrayLevelEmphasis', 'original_glrlm_GrayLevelNonUniformityNormalized', 'original_glrlm_GrayLevelVariance', 'original_glrlm_HighGrayLevelRunEmphasis', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_glrlm_LowGrayLevelRunEmphasis', 'original_glrlm_RunEntropy', 'original

In [None]:
# Support Vector Classifier
xgb_model = XGBRegressor(objective ='reg:linear', colsample_bytree = 0.5, learning_rate = 0.01,
            max_depth = 5, alpha = 5, n_estimators = 200, random_state = 0)
selected_features_svc = select_features_with_anova(X_normalized, y, xgb_model, (10, 50), 3)
print("Selected feature names with XGB:", selected_features_svc)
print("Number of features selected:", len(selected_features_svc))




Optimal number of features: 30
Selected feature names with XGB: ['HistologyType', 'original_shape_SurfaceVolumeRatio', 'original_firstorder_Minimum', 'original_glcm_Autocorrelation', 'original_glcm_Correlation', 'original_glcm_JointAverage', 'original_glcm_MCC', 'original_glcm_SumAverage', 'original_gldm_DependenceNonUniformityNormalized', 'original_gldm_HighGrayLevelEmphasis', 'original_gldm_LargeDependenceEmphasis', 'original_gldm_LargeDependenceHighGrayLevelEmphasis', 'original_gldm_LargeDependenceLowGrayLevelEmphasis', 'original_gldm_LowGrayLevelEmphasis', 'original_gldm_SmallDependenceHighGrayLevelEmphasis', 'original_glrlm_HighGrayLevelRunEmphasis', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_glrlm_LowGrayLevelRunEmphasis', 'original_glrlm_RunEntropy', 'original_glrlm_RunLengthNonUniformityNormalized', 'original_glrlm_RunPercentage', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_glszm_GrayLevelNonUniformityNormalized', 'original_glszm_GrayLevelVariance', '