In [None]:
# Bone marrow

In [None]:
In this machine learning project, we aim to predict the survival status of patients with bone marrow conditions using various health-related features from their medical data. The target variable, 'survival_status,' indicates whether a patient survived ('yes') or not ('no').

In [None]:
import pandas as pd
from scipy.io import arff
from scipy import stats
import numpy as np
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from scikeras.wrappers import KerasClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

In [None]:
# Load the ARFF file
path = 'bone-marrow.arff'
data = arff.loadarff(path)

# Convert the data to a pandas DataFrame
df = pd.DataFrame(data[0])

# Convert byte strings to strings if necessary (common with ARFF files)
df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

# Save the DataFrame to a CSV file
csv_path = 'bone-marrow-converted.csv'
df.to_csv(csv_path, index=False)

print("ARFF file has been converted to CSV.")


ARFF file has been converted to CSV.


In [None]:
# Display the first few rows of the dataset to understand its structure
print(df.head())

  Recipientgender Stemcellsource   Donorage Donorage35 IIIV Gendermatch  \
0               1              1  22.830137          0    1           0   
1               1              0  23.342466          0    1           0   
2               1              0  26.394521          0    1           0   
3               0              0  39.684932          1    1           0   
4               0              1  33.358904          0    0           0   

  DonorABO RecipientABO RecipientRh ABOmatch  ... extcGvHD CD34kgx10d6  \
0        1            1           1        0  ...        1        7.20   
1       -1           -1           1        0  ...        1        4.50   
2       -1           -1           1        0  ...        1        7.94   
3        1            2           1        1  ...        ?        4.25   
4        1            2           0        1  ...        1       51.85   

    CD3dCD34 CD3dkgx10d8 Rbodymass ANCrecovery PLTrecovery  \
0   1.338760        5.38      35.0        

In [None]:
# Display summary statistics to get an idea of the data's distribution
print(df.describe())

         Donorage  Recipientage  CD34kgx10d6    CD3dCD34  CD3dkgx10d8  \
count  187.000000    187.000000   187.000000  182.000000   182.000000   
mean    33.472068      9.931551    11.891781    5.385096     4.745714   
std      8.271826      5.305639     9.914386    9.598716     3.859128   
min     18.646575      0.600000     0.790000    0.204132     0.040000   
25%     27.039726      5.050000     5.350000    1.786683     1.687500   
50%     33.550685      9.600000     9.720000    2.734462     4.325000   
75%     40.117809     14.050000    15.415000    5.823565     6.785000   
max     55.553425     20.200000    57.780000   99.560970    20.020000   

        Rbodymass     ANCrecovery     PLTrecovery  time_to_aGvHD_III_IV  \
count  185.000000      187.000000      187.000000            187.000000   
mean    35.801081    26752.866310    90937.919786         775408.042781   
std     19.650922   161747.200525   288242.407688         418425.252689   
min      6.000000        9.000000        9

In [None]:
# Identify non-numeric columns
non_numeric_columns = df.select_dtypes(exclude=[np.number]).columns
print("Non-numeric columns:", non_numeric_columns)

Non-numeric columns: Index(['Recipientgender', 'Stemcellsource', 'Donorage35', 'IIIV',
       'Gendermatch', 'DonorABO', 'RecipientABO', 'RecipientRh', 'ABOmatch',
       'CMVstatus', 'DonorCMV', 'RecipientCMV', 'Disease', 'Riskgroup',
       'Txpostrelapse', 'Diseasegroup', 'HLAmatch', 'HLAmismatch', 'Antigen',
       'Alel', 'HLAgrI', 'Recipientage10', 'Recipientageint', 'Relapse',
       'aGvHDIIIIV', 'extcGvHD'],
      dtype='object')


In [None]:
# Check the unique values for these non-numeric (integer-encoded) columns
for column in non_numeric_columns:
    print(f"{column}: Unique values: {df[column].unique()}")

Recipientgender: Unique values: ['1' '0']
Stemcellsource: Unique values: ['1' '0']
Donorage35: Unique values: ['0' '1']
IIIV: Unique values: ['1' '0']
Gendermatch: Unique values: ['0' '1']
DonorABO: Unique values: ['1' '-1' '2' '0']
RecipientABO: Unique values: ['1' '-1' '2' '0' '?']
RecipientRh: Unique values: ['1' '0' '?']
ABOmatch: Unique values: ['0' '1' '?']
CMVstatus: Unique values: ['3' '0' '2' '1' '?']
DonorCMV: Unique values: ['1' '0' '?']
RecipientCMV: Unique values: ['1' '0' '?']
Disease: Unique values: ['ALL' 'AML' 'chronic' 'nonmalignant' 'lymphoma']
Riskgroup: Unique values: ['1' '0']
Txpostrelapse: Unique values: ['0' '1']
Diseasegroup: Unique values: ['1' '0']
HLAmatch: Unique values: ['0' '1' '3' '2']
HLAmismatch: Unique values: ['0' '1']
Antigen: Unique values: ['-1' '1' '0' '2' '?']
Alel: Unique values: ['-1' '0' '2' '1' '3' '?']
HLAgrI: Unique values: ['0' '1' '7' '3' '2' '4' '5']
Recipientage10: Unique values: ['0' '1']
Recipientageint: Unique values: ['1' '0' '2']

In [None]:
# List of post-transplant features to exclude
post_transplant_features = [
    'IIIV', 'aGvHDIIIIV', 'extcGvHD', 'ANCrecovery', 'PLTrecovery',
    'time_to_aGvHD_III_IV', 'survival_time', 'Relapse'
]

# Drop post-transplant features
df = df.drop(columns=post_transplant_features)

# Check the remaining features
print(df.columns)

# Example: Display the first few rows of the pre-transplant dataset
print(df.head(10))


Index(['Recipientgender', 'Stemcellsource', 'Donorage', 'Donorage35',
       'Gendermatch', 'DonorABO', 'RecipientABO', 'RecipientRh', 'ABOmatch',
       'CMVstatus', 'DonorCMV', 'RecipientCMV', 'Disease', 'Riskgroup',
       'Txpostrelapse', 'Diseasegroup', 'HLAmatch', 'HLAmismatch', 'Antigen',
       'Alel', 'HLAgrI', 'Recipientage', 'Recipientage10', 'Recipientageint',
       'CD34kgx10d6', 'CD3dCD34', 'CD3dkgx10d8', 'Rbodymass',
       'survival_status'],
      dtype='object')
  Recipientgender Stemcellsource   Donorage Donorage35 Gendermatch DonorABO  \
0               1              1  22.830137          0           0        1   
1               1              0  23.342466          0           0       -1   
2               1              0  26.394521          0           0       -1   
3               0              0  39.684932          1           0        1   
4               0              1  33.358904          0           0        1   
5               1              0  27.391

In [None]:
# We also notice that some columns contain '?', which indicates missing data.
# We need to decide how to handle these. For simplicity, let's treat them as NaN and then impute.
df.replace('?', np.nan, inplace=True)

# Check for missing values after replacing '?' with NaN.
print(df.isnull().sum())

Recipientgender     0
Stemcellsource      0
Donorage            0
Donorage35          0
Gendermatch         0
DonorABO            0
RecipientABO        1
RecipientRh         2
ABOmatch            1
CMVstatus          16
DonorCMV            2
RecipientCMV       14
Disease             0
Riskgroup           0
Txpostrelapse       0
Diseasegroup        0
HLAmatch            0
HLAmismatch         0
Antigen             1
Alel                1
HLAgrI              0
Recipientage        0
Recipientage10      0
Recipientageint     0
CD34kgx10d6         0
CD3dCD34            5
CD3dkgx10d8         5
Rbodymass           2
survival_status     0
dtype: int64


In [None]:
# Prepare the dataset for KNN imputation
imputation_data = df[['CD34kgx10d6', 'CD3dkgx10d8', 'CD3dCD34']]

# Initialize the KNN imputer
knn_imputer = KNNImputer(n_neighbors=5)

# Perform the imputation
imputed_data = knn_imputer.fit_transform(imputation_data)

# Assign the imputed values back to the original dataset
df[['CD34kgx10d6', 'CD3dkgx10d8', 'CD3dCD34']] = imputed_data

In [None]:
# Drop rows with missing 'ABOmatch' values
df.dropna(subset=['ABOmatch','Antigen', 'Alel', 'RecipientRh','CMVstatus','RecipientABO', 'DonorABO'], inplace=True)

# Display the dataset to check the final state
print(df[['ABOmatch', 'survival_status','RecipientABO', 'DonorABO']].head(10))

   ABOmatch  survival_status RecipientABO DonorABO
0         0              0.0            1        1
1         0              1.0           -1       -1
2         0              1.0           -1       -1
3         1              1.0            2        1
4         1              0.0            2        1
7         1              1.0            1        0
8         1              0.0            0        2
9         1              0.0            0        1
10        1              0.0           -1        0
11        0              0.0            1        1


In [None]:
# Function to fill CMVstatus, DonorCMV, and RecipientCMV based on the given rules
def fill_cmv_status(row):
    # Fill CMVstatus
    if pd.notna(row['CMVstatus']):
        return row['CMVstatus'], row['DonorCMV'], row['RecipientCMV']

    if pd.notna(row['DonorCMV']) and pd.notna(row['RecipientCMV']):
        if row['DonorCMV'] == 0 and row['RecipientCMV'] == 0:
            return 0, row['DonorCMV'], row['RecipientCMV']
        elif row['DonorCMV'] == 1 and row['RecipientCMV'] == 0:
            return 1, row['DonorCMV'], row['RecipientCMV']
        elif row['DonorCMV'] == 0 and row['RecipientCMV'] == 1:
            return 2, row['DonorCMV'], row['RecipientCMV']
        elif row['DonorCMV'] == 1 and row['RecipientCMV'] == 1:
            return 3, row['DonorCMV'], row['RecipientCMV']

    if pd.isna(row['CMVstatus']):
        if row['survival_status'] == 0:
            if pd.isna(row['DonorCMV']):
                donor_cmv = 1 if row['RecipientCMV'] == 1 else 0
                return 1 if row['RecipientCMV'] == 1 else 3, donor_cmv, row['RecipientCMV']
            if pd.isna(row['RecipientCMV']):
                recipient_cmv = 0 if row['DonorCMV'] == 0 else 1
                return 2 if row['DonorCMV'] == 0 else 3, row['DonorCMV'], recipient_cmv
        else:
            if pd.isna(row['DonorCMV']):
                donor_cmv = 0 if row['RecipientCMV'] == 0 else 1
                return 0 if row['RecipientCMV'] == 0 else 2, donor_cmv, row['RecipientCMV']
            if pd.isna(row['RecipientCMV']):
                recipient_cmv = 0 if row['DonorCMV'] == 0 else 1
                return 0 if row['DonorCMV'] == 0 else 1, row['DonorCMV'], recipient_cmv

    return row['CMVstatus'], row['DonorCMV'], row['RecipientCMV']

# Apply the function to fill missing CMVstatus, DonorCMV, and RecipientCMV values
df[['CMVstatus', 'DonorCMV', 'RecipientCMV']] = df.apply(fill_cmv_status, axis=1, result_type='expand')

# Display the first few rows of the modified dataset
display_data = df[['CMVstatus', 'DonorCMV', 'RecipientCMV', 'survival_status']]
print(display_data.head(10))

   CMVstatus DonorCMV RecipientCMV  survival_status
0          3        1            1              0.0
1          0        0            0              1.0
2          2        0            1              1.0
3          1        1            0              1.0
4          0        0            1              0.0
7          1        1            0              1.0
8          2        0            1              0.0
9          2        0            1              0.0
10         1        1            0              0.0
11         0        0            0              0.0


In [None]:
import pandas as pd

# Calculate mean body mass for each specific age
age_means = df.groupby('Recipientage')['Rbodymass'].mean()

# Function to impute missing values with age-specific means
def impute_bodymass(row):
    if pd.isna(row['Rbodymass']):
        return age_means[row['Recipientage']]
    else:
        return row['Rbodymass']

# Apply the imputation function
df['Rbodymass'] = df.apply(impute_bodymass, axis=1)


# Drop any remaining rows with missing 'Rbodymass' values
df.dropna(subset=['Rbodymass'], inplace=True)


# Display the dataset to check the imputed values
print(df[['Rbodymass', 'Recipientage']].head(10))



    Rbodymass  Recipientage
0        35.0           9.6
1        20.6           4.0
2        23.4           6.6
3        50.0          18.1
4         9.0           1.3
7        56.0          18.2
8        20.5           7.9
9        16.5           4.7
10       10.5           1.9
11       47.0          13.4


In [None]:
# One-hot encode the 'Disease' column
df = pd.get_dummies(df, columns=['Disease'], drop_first=False)

# Display the dataset to check the one-hot encoded 'Disease' column
print(df.head(10))

   Recipientgender Stemcellsource   Donorage Donorage35 Gendermatch DonorABO  \
0                1              1  22.830137          0           0        1   
1                1              0  23.342466          0           0       -1   
2                1              0  26.394521          0           0       -1   
3                0              0  39.684932          1           0        1   
4                0              1  33.358904          0           0        1   
7                1              0  21.435616          0           0        0   
8                1              1  32.641096          0           0        2   
9                1              1  28.783562          0           1        1   
10               0              1  29.731507          0           0        0   
11               0              1  36.800000          1           0        1   

   RecipientABO RecipientRh ABOmatch CMVstatus  ... CD34kgx10d6   CD3dCD34  \
0             1           1        0     

In [None]:
# Check for missing values after imputing them
print(df.isnull().sum())

Recipientgender         0
Stemcellsource          0
Donorage                0
Donorage35              0
Gendermatch             0
DonorABO                0
RecipientABO            0
RecipientRh             0
ABOmatch                0
CMVstatus               0
DonorCMV                0
RecipientCMV            0
Riskgroup               0
Txpostrelapse           0
Diseasegroup            0
HLAmatch                0
HLAmismatch             0
Antigen                 0
Alel                    0
HLAgrI                  0
Recipientage            0
Recipientage10          0
Recipientageint         0
CD34kgx10d6             0
CD3dCD34                0
CD3dkgx10d8             0
Rbodymass               0
survival_status         0
Disease_ALL             0
Disease_AML             0
Disease_chronic         0
Disease_lymphoma        0
Disease_nonmalignant    0
dtype: int64


In [None]:
# Assume 'survival_status' is the target variable and it's binary
y = df['survival_status']  # target variable
X = df.drop('survival_status', axis=1)  # features

# Ensure all input features for chi2 are non-negative
# Chi-Square input needs to be non-negative. If not, use MinMaxScaler to scale them
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X = pd.DataFrame(X_scaled, columns=X.columns)

In [None]:
# Apply SelectKBest class to extract top k best features using Chi-Square
k = 11  # Number of features to select
bestfeatures = SelectKBest(score_func=chi2, k=k)
fit = bestfeatures.fit(X, y)

# Get the scores for each feature
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(X.columns)

# Concatenate the two dataframes for better visualization
featureScores = pd.concat([dfcolumns, dfscores], axis=1)
featureScores.columns = ['Feature', 'Score']  # Naming the dataframe columns
print(featureScores.nlargest(k, 'Score'))  # Print k best features


                 Feature     Score
30      Disease_lymphoma  9.109589
13         Txpostrelapse  3.002626
12             Riskgroup  2.384081
21        Recipientage10  1.395818
26             Rbodymass  1.081810
25           CD3dkgx10d8  0.958306
31  Disease_nonmalignant  0.949551
23           CD34kgx10d6  0.841769
3             Donorage35  0.813180
20          Recipientage  0.747882
7            RecipientRh  0.673072


In [None]:

# Selecting top features
features = ['Recipientgender', 'Stemcellsource', 'Donorage', 'Donorage35',
       'Gendermatch', 'DonorABO', 'RecipientABO', 'RecipientRh', 'ABOmatch',
       'CMVstatus', 'DonorCMV', 'RecipientCMV', 'Disease_lymphoma', 'Riskgroup',
       'Txpostrelapse', 'Diseasegroup', 'HLAmatch', 'HLAmismatch', 'Antigen',
       'Alel', 'HLAgrI', 'Recipientage', 'Recipientage10', 'Recipientageint',
       'CD34kgx10d6', 'CD3dCD34', 'CD3dkgx10d8', 'Rbodymass','Disease_ALL' , 'Disease_AML' , 'Disease_chronic' , 'Disease_nonmalignant'
       ]
top_features=[ 'Disease_lymphoma', 'Riskgroup',
       'Txpostrelapse','Recipientage10'
       ]



In [None]:

# Prepare the data
X = df[top_features]
y = df['survival_status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the models
def create_model():
    model = Sequential([
        Dense(12, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

models = {
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "NB": GaussianNB(),
    "SVM": SVC(probability=True),
    "ANN": KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)
}

# Perform cross-validation on the training set
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    if name == "ANN":
        # Keras models do not directly support cross_val_score
        train_accuracies = []
        test_accuracies = []
        for train_index, val_index in skf.split(X_train_scaled, y_train):
            X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)
            train_accuracy = model.score(X_train_fold, y_train_fold)
            y_val_pred = model.predict(X_val_fold)
            test_accuracy = accuracy_score(y_val_fold, (y_val_pred > 0.5).astype(int))

            train_accuracies.append(train_accuracy)
            test_accuracies.append(test_accuracy)

        avg_train_accuracy = sum(train_accuracies) / len(train_accuracies)
        avg_test_accuracy = sum(test_accuracies) / len(test_accuracies)

    else:
        train_accuracies = cross_val_score(model, X_train_scaled, y_train, cv=skf, scoring='accuracy')
        avg_train_accuracy = train_accuracies.mean()
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        avg_test_accuracy = accuracy_score(y_test, y_pred)

    # Generate classification report
    report = classification_report(y_test, y_pred)
    print(f"Classification Report for {name}:")
    print(report)

    # Print accuracies
    print(f"Average Training Accuracy for {name}: {avg_train_accuracy:.4f}")
    print(f"Testing Accuracy for {name}: {avg_test_accuracy:.4f}")
    print("------------------------------------------------\n")


Classification Report for DT:
              precision    recall  f1-score   support

         0.0       0.60      0.47      0.53        19
         1.0       0.47      0.60      0.53        15

    accuracy                           0.53        34
   macro avg       0.54      0.54      0.53        34
weighted avg       0.54      0.53      0.53        34

Average Training Accuracy for DT: 0.5900
Testing Accuracy for DT: 0.5294
------------------------------------------------

Classification Report for RF:
              precision    recall  f1-score   support

         0.0       0.60      0.47      0.53        19
         1.0       0.47      0.60      0.53        15

    accuracy                           0.53        34
   macro avg       0.54      0.54      0.53        34
weighted avg       0.54      0.53      0.53        34

Average Training Accuracy for RF: 0.5826
Testing Accuracy for RF: 0.5294
------------------------------------------------

Classification Report for KNN:
         

  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Classification Report for ANN:
              precision    recall  f1-score   support

         0.0       0.60      0.47      0.53        19
         1.0       0.47      0.60      0.53        15

    accuracy                           0.53        34
   macro avg       0.54      0.54      0.53        34
weighted avg       0.54      0.53      0.53        34

Average Training Accuracy for ANN: 0.6567
Testing Accuracy for ANN: 0.5974
------------------------------------------------



In [None]:
import itertools
features = [ 'Disease_lymphoma', 'Riskgroup',
       'Txpostrelapse','Recipientage10','Rbodymass','CD3dkgx10d8','Disease_nonmalignant'
       ]


# Define the models
def create_model(input_shape):
    model = Sequential([
        Dense(12, activation='relu', input_shape=(input_shape,)),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Function to evaluate a given set of features
def evaluate_features(feature_set):
    X = df[list(feature_set)]
    y = df['survival_status']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Perform cross-validation on the training set
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    models = {
        "DT": DecisionTreeClassifier(),
        "RF": RandomForestClassifier(),
        "KNN": KNeighborsClassifier(),
        "NB": GaussianNB(),
        "SVM": SVC(probability=True),
        "ANN": KerasClassifier(model=create_model, model__input_shape=X_train_scaled.shape[1], epochs=100, batch_size=10, verbose=0)
    }

    results = {}

    for name, model in models.items():
        if name == "ANN":
            # Keras models do not directly support cross_val_score
            train_accuracies = []
            test_accuracies = []
            for train_index, val_index in skf.split(X_train_scaled, y_train):
                X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
                y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

                model = KerasClassifier(model=create_model, model__input_shape=X_train_fold.shape[1], epochs=100, batch_size=10, verbose=0)
                model.fit(X_train_fold, y_train_fold)
                train_accuracy = model.score(X_train_fold, y_train_fold)
                y_val_pred = model.predict(X_val_fold)
                test_accuracy = accuracy_score(y_val_fold, (y_val_pred > 0.5).astype(int))

                train_accuracies.append(train_accuracy)
                test_accuracies.append(test_accuracy)

            avg_train_accuracy = np.mean(train_accuracies)
            avg_test_accuracy = np.mean(test_accuracies)

        else:
            train_accuracies = cross_val_score(model, X_train_scaled, y_train, cv=skf, scoring='accuracy')
            avg_train_accuracy = train_accuracies.mean()
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)
            avg_test_accuracy = accuracy_score(y_test, y_pred)

        # Generate classification report
        report = classification_report(y_test, y_pred)
        results[name] = {
            "train_accuracy": avg_train_accuracy,
            "test_accuracy": avg_test_accuracy,
            "report": report
        }

    return results

# Generate all combinations of features (limiting to 3 to 5 features for manageability)
all_combinations = []
for r in range(3, 6):
    combinations = list(itertools.combinations(features, r))
    all_combinations.extend(combinations)

best_results = {}

# Evaluate each combination
for feature_set in all_combinations:
    print(f"Evaluating feature set: {feature_set}")
    results = evaluate_features(feature_set)

    for model_name, result in results.items():
        if model_name not in best_results or result['test_accuracy'] > best_results[model_name]['test_accuracy']:
            best_results[model_name] = {
                "features": feature_set,
                "train_accuracy": result['train_accuracy'],
                "test_accuracy": result['test_accuracy'],
                "report": result['report']
            }

# Print the best results for each model
for model_name, result in best_results.items():
    print(f"Best feature set for {model_name}: {result['features']}")
    print(f"Average Training Accuracy: {result['train_accuracy']:.4f}")
    print(f"Testing Accuracy: {result['test_accuracy']:.4f}")
    print(f"Classification Report:\n{result['report']}")
    print("------------------------------------------------\n")


Evaluating feature set: ('Disease_lymphoma', 'Riskgroup', 'Txpostrelapse')


NameError: name 'df' is not defined

In [53]:
selected_features = ['Disease_lymphoma', 'Txpostrelapse', 'Riskgroup']

# Prepare the data
X = df[selected_features]
y = df['survival_status']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the models
def create_model():
    model = Sequential([
        Dense(12, activation='relu', input_shape=(X_train_scaled.shape[1],)),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

models = {
    "DT": DecisionTreeClassifier(),
    "RF": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "NB": GaussianNB(),
    "SVM": SVC(probability=True),
    "ANN": KerasClassifier(build_fn=create_model, epochs=100, batch_size=10, verbose=0)
}

# Perform cross-validation on the training set
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    if name == "ANN":
        # Keras models do not directly support cross_val_score
        train_accuracies = []
        test_accuracies = []
        for train_index, val_index in skf.split(X_train_scaled, y_train):
            X_train_fold, X_val_fold = X_train_scaled[train_index], X_train_scaled[val_index]
            y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

            model.fit(X_train_fold, y_train_fold)
            train_accuracy = model.score(X_train_fold, y_train_fold)
            y_val_pred = model.predict(X_val_fold)
            test_accuracy = accuracy_score(y_val_fold, (y_val_pred > 0.5).astype(int))

            train_accuracies.append(train_accuracy)
            test_accuracies.append(test_accuracy)

        avg_train_accuracy = sum(train_accuracies) / len(train_accuracies)
        avg_test_accuracy = sum(test_accuracies) / len(test_accuracies)

    else:
        train_accuracies = cross_val_score(model, X_train_scaled, y_train, cv=skf, scoring='accuracy')
        avg_train_accuracy = train_accuracies.mean()
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        avg_test_accuracy = accuracy_score(y_test, y_pred)

    # Generate classification report
    report = classification_report(y_test, y_pred)
    print(f"Classification Report for {name}:")
    print(report)

    # Print accuracies
    print(f"Average Training Accuracy for {name}: {avg_train_accuracy:.4f}")
    print(f"Testing Accuracy for {name}: {avg_test_accuracy:.4f}")
    print("------------------------------------------------\n")


Classification Report for DT:
              precision    recall  f1-score   support

         0.0       0.63      0.89      0.74        19
         1.0       0.71      0.33      0.45        15

    accuracy                           0.65        34
   macro avg       0.67      0.61      0.60        34
weighted avg       0.67      0.65      0.61        34

Average Training Accuracy for DT: 0.5969
Testing Accuracy for DT: 0.6471
------------------------------------------------

Classification Report for RF:
              precision    recall  f1-score   support

         0.0       0.63      0.89      0.74        19
         1.0       0.71      0.33      0.45        15

    accuracy                           0.65        34
   macro avg       0.67      0.61      0.60        34
weighted avg       0.67      0.65      0.61        34

Average Training Accuracy for RF: 0.5977
Testing Accuracy for RF: 0.6471
------------------------------------------------

Classification Report for KNN:
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  X, y = self._initialize(X, y)


Classification Report for SVM:
              precision    recall  f1-score   support

         0.0       0.63      0.89      0.74        19
         1.0       0.71      0.33      0.45        15

    accuracy                           0.65        34
   macro avg       0.67      0.61      0.60        34
weighted avg       0.67      0.65      0.61        34

Average Training Accuracy for SVM: 0.5524
Testing Accuracy for SVM: 0.6471
------------------------------------------------



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Classification Report for ANN:
              precision    recall  f1-score   support

         0.0       0.63      0.89      0.74        19
         1.0       0.71      0.33      0.45        15

    accuracy                           0.65        34
   macro avg       0.67      0.61      0.60        34
weighted avg       0.67      0.65      0.61        34

Average Training Accuracy for ANN: 0.6194
Testing Accuracy for ANN: 0.5977
------------------------------------------------



In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

# Define the parameter grid with valid options
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10,20,30],
    'min_samples_leaf': [1, 2, 4,5,6]
}

# Initialize the Random Forest model
rf = RandomForestClassifier()

# Perform Randomized Search
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=100, cv=3, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train_scaled, y_train)

print("Best parameters found: ", random_search.best_params_)
print("Best cross-validation score: ", random_search.best_score_)

# Evaluate the best model on the train and test sets
best_rf = random_search.best_estimator_
y_train_pred = best_rf.predict(X_train_scaled)
y_test_pred = best_rf.predict(X_test_scaled)

print("Classification Report for Random Forest (Train):")
print(classification_report(y_train, y_train_pred))
print(f"Train Accuracy for Random Forest: {accuracy_score(y_train, y_train_pred):.4f}")
print("------------------------------------------------\n")

print("Classification Report for Random Forest (Test):")
print(classification_report(y_test, y_test_pred))
print(f"Test Accuracy for Random Forest: {accuracy_score(y_test, y_test_pred):.4f}")
print("------------------------------------------------\n")


Fitting 3 folds for each of 100 candidates, totalling 300 fits


  pid = os.fork()


In [None]:


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# Create the pipeline for SVM
svm_pipeline = Pipeline([
    ('smote', smote),
    ('scaler', StandardScaler()),  # Add scaler to pipeline
    ('classifier', SVC(probability=True))
])

# Define the parameter grid for SVM
svm_param_grid = {
    'classifier__C': [0.1, 1, 10, 100],
    'classifier__gamma': [1, 0.1, 0.01, 0.001],
    'classifier__kernel': ['linear', 'rbf']
}

# Perform Randomized Search with the SVM pipeline
svm_random_search = RandomizedSearchCV(estimator=svm_pipeline, param_distributions=svm_param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
svm_random_search.fit(X_train, y_train)  # Use original X_train

print("Best parameters found for SVM: ", svm_random_search.best_params_)
print("Best cross-validation score for SVM: ", svm_random_search.best_score_)

# Evaluate the best SVM model on the train and test set
best_svm_pipeline = svm_random_search.best_estimator_
y_svm_train_pred = best_svm_pipeline.predict(X_train)
y_svm_test_pred = best_svm_pipeline.predict(X_test)

print("Classification Report for SVM (Train):")
print(classification_report(y_train, y_svm_train_pred))
print(f"Train Accuracy for SVM: {accuracy_score(y_train, y_svm_train_pred):.4f}")
print("------------------------------------------------\n")

print("Classification Report for SVM (Test):")
print(classification_report(y_test, y_svm_test_pred))
print(f"Test Accuracy for SVM: {accuracy_score(y_test, y_svm_test_pred):.4f}")
print("------------------------------------------------\n")

### KNN Model with SMOTE
# Create the pipeline for KNN
knn_pipeline = Pipeline([
    ('smote', smote),
    ('scaler', StandardScaler()),  # Add scaler to pipeline
    ('classifier', KNeighborsClassifier())
])

# Define the parameter grid for KNN
knn_param_grid = {
    'classifier__n_neighbors': [3, 5, 7, 9],
    'classifier__weights': ['uniform', 'distance'],
    'classifier__metric': ['euclidean', 'manhattan']
}

# Perform Randomized Search with the KNN pipeline
knn_random_search = RandomizedSearchCV(estimator=knn_pipeline, param_distributions=knn_param_grid, n_iter=20, cv=3, verbose=2, random_state=42, n_jobs=-1)
knn_random_search.fit(X_train, y_train)  # Use original X_train

print("Best parameters found for KNN: ", knn_random_search.best_params_)
print("Best cross-validation score for KNN: ", knn_random_search.best_score_)

# Evaluate the best KNN model on the train and test set
best_knn_pipeline = knn_random_search.best_estimator_
y_knn_train_pred = best_knn_pipeline.predict(X_train)
y_knn_test_pred = best_knn_pipeline.predict(X_test)

print("Classification Report for KNN (Train):")
print(classification_report(y_train, y_knn_train_pred))
print(f"Train Accuracy for KNN: {accuracy_score(y_train, y_knn_train_pred):.4f}")
print("------------------------------------------------\n")

print("Classification Report for KNN (Test):")
print(classification_report(y_test, y_knn_test_pred))
print(f"Test Accuracy for KNN: {accuracy_score(y_test, y_knn_test_pred):.4f}")
print("------------------------------------------------\n")

# Summary of results for all models
models = {
    "SVM": best_svm_pipeline,
    "KNN": best_knn_pipeline
}

for name, model in models.items():
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    print(f"Classification Report for {name} (Train):")
    print(classification_report(y_train, y_train_pred))
    print(f"Train Accuracy for {name}: {accuracy_score(y_train, y_train_pred):.4f}")
    print("------------------------------------------------\n")
    print(f"Classification Report for {name} (Test):")
    print(classification_report(y_test, y_test_pred))
    print(f"Test Accuracy for {name}: {accuracy_score(y_test, y_test_pred):.4f}")
    print("------------------------------------------------\n")
