In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, make_scorer, balanced_accuracy_score
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import EditedNearestNeighbours
import pickle



# Location of the training spreadsheet that is read below.
url = 'https://raw.githubusercontent.com/ArvinCorotana/ML/main/TrainDataset2023.xls'

# Read 'Sheet1', turn every 999 into NaN, then keep only the rows where
# both outcome columns are present.
dataset = (
    pd.read_excel(url, sheet_name='Sheet1')
    .replace(999, np.nan)
    .dropna(subset=['pCR (outcome)', 'RelapseFreeSurvival (outcome)'], how='any')
)





def data_split(dataset):
    """Split the table into train/test feature and outcome frames.

    The 'ID' column is removed; of the remaining columns the first two are
    taken as the outcomes and everything after them as the features. Test rows
    containing any missing value are discarded, training rows are kept as-is.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Table containing an 'ID' column. It is not modified.
        NOTE(review): assumes the two outcome columns come first once 'ID'
        is dropped — confirm against the spreadsheet layout.

    Returns
    -------
    X_train, X_test, y_train, y_test : pandas.DataFrame
        Each with a fresh 0..n-1 index.
    """
    # Drop into a new frame: an in-place drop would mutate the caller's object
    # and make a second call fail with KeyError because 'ID' is already gone.
    dataset = dataset.drop(columns='ID')
    X = dataset.iloc[:, 2:]
    y = dataset.iloc[:, :2]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Remove incomplete test rows from outcomes and features together so the
    # two frames stay row-aligned.
    test_complete = pd.concat((y_test, X_test), axis=1).dropna(how='any')
    X_test = test_complete.iloc[:, 2:]
    y_test = test_complete.iloc[:, :2]

    return (
        X_train.reset_index(drop=True),
        X_test.reset_index(drop=True),
        y_train.reset_index(drop=True),
        y_test.reset_index(drop=True),
    )


def imputation(X_train):
    """Fill missing training values with each column's most frequent value.

    A SimpleImputer is fitted on ``X_train`` and applied to it. The fitted
    imputer is also pickled to 'imputer.pkl' in the working directory.

    Returns
    -------
    pandas.DataFrame
        The imputed values, carrying the columns and index of ``X_train``.
    """
    mode_imputer = SimpleImputer(strategy='most_frequent')
    filled_values = mode_imputer.fit(X_train).transform(X_train)
    filled_frame = pd.DataFrame(
        filled_values,
        columns=X_train.columns,
        index=X_train.index,
    )

    # Persist the fitted imputer alongside the notebook.
    with open('imputer.pkl', 'wb') as handle:
        pickle.dump(mode_imputer, handle)

    return filled_frame

def outlier(mri_features):
    """Cap outliers in every column at the 1.5 * IQR fences.

    For each column the quartiles Q1 and Q3 are computed; values below
    ``Q1 - 1.5 * IQR`` are raised to that limit and values above
    ``Q3 + 1.5 * IQR`` are lowered to that limit. Missing values are left
    untouched.

    Parameters
    ----------
    mri_features : pandas.DataFrame
        Numeric feature columns. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame of the same shape with the capped values.
    """
    q1 = mri_features.quantile(0.25)
    q3 = mri_features.quantile(0.75)
    iqr = q3 - q1
    lower_lim = q1 - (1.5 * iqr)
    upper_lim = q3 + (1.5 * iqr)

    # The limits are Series indexed by column name; axis=1 aligns them with
    # the frame's columns so each column is clipped to its own fences in one
    # vectorized call instead of a Python-level lambda per value.
    return mri_features.clip(lower=lower_lim, upper=upper_lim, axis=1)

def target(y_train,y_test):
    """Drop the relapse-free-survival column from both outcome frames.

    After removing 'RelapseFreeSurvival (outcome)' each frame is squeezed, so
    a frame left with a single column comes back as a Series.

    Returns
    -------
    tuple
        ``(y_train, y_test)`` after the drop and squeeze.
    """
    survival_column = 'RelapseFreeSurvival (outcome)'
    train_target = y_train.drop(columns=survival_column).squeeze()
    test_target = y_test.drop(columns=survival_column).squeeze()
    return train_target, test_target

def oversampling(X_train,y_train):
    """Resample the training data with SMOTE.

    SMOTE is configured with ``sampling_strategy='minority'``, 5 neighbours
    and a fixed seed of 42.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as produced by ``fit_resample``.
    """
    smote = SMOTE(sampling_strategy='minority', k_neighbors=5, random_state=42)
    X_resampled, y_resampled = smote.fit_resample(X_train, y_train)
    return X_resampled, y_resampled


def normalization(X_train,X_test_imputed):
    """Min-max scale the train and test features.

    The scaler is fitted on ``X_train`` only and then applied to both frames.
    The fitted scaler is also pickled to 'minmax.pkl' in the working directory.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features used to fit the scaler.
    X_test_imputed : pandas.DataFrame
        Test features with the same columns as ``X_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_scaled, X_test_scaled)``, each keeping the columns and
        index of its input.
    """
    scaler = MinMaxScaler()
    scaler.fit(X_train)

    # Keep the original index so the result can be concatenated column-wise
    # with other frames sliced from the same rows.
    X_train_scaled = pd.DataFrame(
        scaler.transform(X_train), columns=X_train.columns, index=X_train.index
    )
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test_imputed),
        columns=X_test_imputed.columns,
        index=X_test_imputed.index,
    )

    with open('minmax.pkl', 'wb') as file:
        pickle.dump(scaler, file)

    # Return the scaled frames; returning the inputs here would silently
    # discard the scaling computed above.
    return X_train_scaled, X_test_scaled


def feature_selection(X_train_new,X_test_new):
    """Drop features that are strongly correlated with several others.

    The absolute pairwise correlation matrix of ``X_train_new`` is computed.
    A column is dropped when more than 3 entries in its column of that matrix
    exceed 0.85 (the count includes the column's correlation with itself).
    The same columns are removed from ``X_test_new``. The names of the
    retained columns are pickled to 'features.pkl' in the working directory.

    Parameters
    ----------
    X_train_new : pandas.DataFrame
        Training features; the only frame used to decide what to drop.
    X_test_new : pandas.DataFrame
        Test features containing the same columns.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(X_train_new, X_test_new)`` without the dropped columns. The input
        frames are not modified.
    """
    correlation_matrix = X_train_new.corr().abs()
    # Per column: how many features (itself included) correlate above 0.85.
    strong_counts = (correlation_matrix > 0.85).sum()
    columns_to_drop = strong_counts.index[strong_counts > 3].tolist()

    X_train_new = X_train_new.drop(columns=columns_to_drop)
    X_test_new = X_test_new.drop(columns=columns_to_drop)

    # Save the columns retained here. Reading a module-level `X_train` instead
    # would raise NameError on a fresh kernel, or pick up a stale frame left
    # over from an earlier run.
    columns = X_train_new.columns.tolist()
    with open('features.pkl', 'wb') as file:
        pickle.dump(columns, file)
    return X_train_new,X_test_new


def data_processing(dataset):
    """Run the preparation pipeline and return model-ready splits.

    Steps, in order: data_split, imputation of the training features, target
    extraction, oversampling of the training data, then the features are cut
    into the first 10 columns (named "clinical" here) and the remaining
    columns (named "mri" here). Only the mri part goes through outlier
    (training only), normalization and feature_selection before both parts are
    joined back together column-wise.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    n_clinical = 10  # the leading columns that bypass the mri-only steps

    X_train, X_test, y_train, y_test = data_split(dataset)
    X_train = imputation(X_train)
    y_train, y_test = target(y_train, y_test)
    X_train, y_train = oversampling(X_train, y_train)

    train_clinical = X_train.iloc[:, :n_clinical]
    test_clinical = X_test.iloc[:, :n_clinical]
    train_mri = X_train.iloc[:, n_clinical:]
    test_mri = X_test.iloc[:, n_clinical:]

    # Outlier capping is applied to the training mri features only.
    train_mri = outlier(train_mri)
    train_mri, test_mri = normalization(train_mri, test_mri)
    train_mri, test_mri = feature_selection(train_mri, test_mri)

    X_train = pd.concat([train_clinical, train_mri], axis=1)
    X_test = pd.concat([test_clinical, test_mri], axis=1)
    return X_train, X_test, y_train, y_test



def MLP(X_train, X_test, y_train, y_test):
    """Grid-search an MLP classifier and print its scores.

    The search is scored with balanced accuracy over 5 folds. The fitted
    search object is pickled to 'best_mlp_model.pkl'. The best estimator is
    then evaluated on the test set and re-scored on the training data with a
    5-fold stratified cross-validation. Nothing is returned.
    """
    print('mlp')
    base_model = MLPClassifier(hidden_layer_sizes=(50,), max_iter=1000, random_state=42)
    # NOTE(review): per the scikit-learn docs `learning_rate` only applies to
    # solver='sgd'; with the default solver these three values may score the
    # same — confirm whether this grid dimension is wanted.
    search_space = {
        'hidden_layer_sizes': [(50,), (100,), (50, 50)],
        'alpha': [0.0001, 0.001, 0.01],
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
    }
    search = GridSearchCV(base_model, search_space, scoring='balanced_accuracy', cv=5)
    search.fit(X_train, y_train)
    tuned_model = search.best_estimator_

    # The whole fitted search object (not only the best estimator) is saved.
    with open('best_mlp_model.pkl', 'wb') as handle:
        pickle.dump(search, handle)
    print("Best parameters found during GridSearchCV:", search.best_params_)

    test_predictions = tuned_model.predict(X_test)
    test_balanced = balanced_accuracy_score(y_test, test_predictions)
    print(f'Balanced Accuracy on test set: {test_balanced:.4f}')

    fold_scores = cross_val_score(
        tuned_model,
        X_train,
        y_train,
        cv=StratifiedKFold(n_splits=5),
        scoring='balanced_accuracy',
    )
    print("Cross-Validation Accuracy Score for each fold:", fold_scores)
    print(f"Mean CV Accuracy for the best MLP model: {fold_scores.mean():.2%}")




def svc(X_train, X_test, y_train, y_test):
    """Fit a default SVC, grid-search a tuned one and print their scores.

    The search is scored with balanced accuracy over 5 stratified folds. The
    best estimator is pickled to 'best_svm_model.pkl', evaluated on the test
    set and re-scored on the training data by cross-validation. Nothing is
    returned.
    """
    print('SVC')
    baseline = SVC()
    baseline.fit(X_train, y_train)
    baseline_predictions = baseline.predict(X_test)
    # Accuracy of the baseline predictions, i.e. what `baseline.score` reports.
    print("Default SVM Accuracy:", accuracy_score(y_test, baseline_predictions))

    search_space = {
        'C': [0.01, 1, 10],
        'gamma': [0.01, 1, 'scale'],
        'kernel': ['linear', 'rbf'],
    }
    search = GridSearchCV(
        baseline,
        search_space,
        cv=StratifiedKFold(n_splits=5),
        scoring='balanced_accuracy',
    )
    search.fit(X_train, y_train)
    tuned = search.best_estimator_

    # Only the best estimator is saved here, not the search object.
    with open('best_svm_model.pkl', 'wb') as handle:
        pickle.dump(tuned, handle)

    test_predictions = tuned.predict(X_test)
    test_accuracy = accuracy_score(y_test, test_predictions)
    print(f"Model Accuracy on test set: {test_accuracy: .2%}")
    test_balanced = balanced_accuracy_score(y_test, test_predictions)
    print(f"Balanced Accuracy for the best SVM model: {test_balanced:.2%}")

    fold_scores = cross_val_score(
        tuned,
        X_train,
        y_train,
        cv=StratifiedKFold(n_splits=5),
        scoring='balanced_accuracy',
    )
    print("Cross-Validation Accuracy Score for each fold:", fold_scores)
    print(f"Mean CV Accuracy for the best SVM model: {fold_scores.mean():.2%}")

    print("Accuracy:", test_accuracy)
    print("Precision:", precision_score(y_test, test_predictions))
    print("Recall:", recall_score(y_test, test_predictions))
    print(confusion_matrix(y_test, test_predictions))


def logistic_regression(X_train, X_test, y_train, y_test):
    """Grid-search a logistic regression model and print its scores.

    The search covers penalty, C and solver, scored with balanced accuracy
    over 5 folds. The fitted search object is pickled to 'best_lr_model.pkl'.
    The best estimator is re-scored on the training data with a 5-fold
    stratified cross-validation and evaluated on the test set. Nothing is
    returned.
    """
    print('lr')
    logreg = LogisticRegression(random_state=42)

    param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga']
    }

    grid_searchlr = GridSearchCV(logreg, param_grid, scoring='balanced_accuracy', cv=5)
    grid_searchlr.fit(X_train, y_train)

    best_logreg = grid_searchlr.best_estimator_

    print("Best parameters found during GridSearchCV:", grid_searchlr.best_params_)
    # The whole fitted search object (not only the best estimator) is saved.
    with open('best_lr_model.pkl', 'wb') as file:
        pickle.dump(grid_searchlr, file)

    cv_scores = cross_val_score(best_logreg, X_train, y_train, cv=StratifiedKFold(n_splits=5), scoring='balanced_accuracy')
    print("Cross-Validation Accuracy Score for each fold:", cv_scores)
    # This summary belongs to the logistic regression model; labelling it
    # "SVM" would misattribute the number in the printed report.
    print(f"Mean CV Accuracy for the best Logistic Regression model: {cv_scores.mean():.2%}")

    y_pred = best_logreg.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'Accuracy on test set: {accuracy:.4f}')
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    print(f'Balanced Accuracy on test set: {balanced_accuracy:.4f}')

def decision_Tree(X_train, X_test, y_train, y_test):
    """Fit a default decision tree, grid-search a tuned one and print scores.

    The search is scored with balanced accuracy over 3 stratified folds. The
    fitted search object is pickled to 'best_dt_model.pkl' and the best
    estimator is evaluated on the test set. Nothing is returned.
    """
    print('DT')
    baseline_tree = DecisionTreeClassifier(random_state=42)
    baseline_tree.fit(X_train, y_train)
    baseline_accuracy = baseline_tree.score(X_test, y_test)
    print(f"Default Model Accuracy on test set: {baseline_accuracy: .2%}")

    search_space = {
        'criterion': ['gini', 'entropy'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 5, 10, 15, 20, 25, 30],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 4, 8, 12],
        'max_features': ['sqrt', 'log2', None],
        'class_weight': [None, 'balanced'],
    }
    search = GridSearchCV(
        baseline_tree,
        search_space,
        scoring='balanced_accuracy',
        cv=StratifiedKFold(n_splits=3),
    )
    search.fit(X_train, y_train)

    tuned_tree = search.best_estimator_
    tuned_accuracy = tuned_tree.score(X_test, y_test)
    print(f"Model Accuracy on test set: {tuned_accuracy: .2%}")

    # The whole fitted search object (not only the best estimator) is saved.
    with open('best_dt_model.pkl', 'wb') as handle:
        pickle.dump(search, handle)

    test_predictions = tuned_tree.predict(X_test)
    test_balanced = balanced_accuracy_score(y_test, test_predictions)
    print(f'Balanced Accuracy: {test_balanced:.4f}')

def RandomForest(X_train, X_test, y_train, y_test):
    """Fit a default random forest, grid-search a tuned one and print scores.

    The search is scored with balanced accuracy over 5 folds. The fitted
    search object is pickled to 'best_rf_model.pkl' and the best estimator is
    evaluated on the test set (accuracy, precision, recall, balanced accuracy
    and confusion matrix). Nothing is returned.
    """
    print('Random Forest')
    # Seeded like the other stochastic models in this file so that repeated
    # runs produce the same forest and the same reported scores.
    rf = RandomForestClassifier(random_state=42)
    rf.fit(X_train, y_train)
    print(f'Model Accuracy = {rf.score(X_test, y_test): .2%}')
    param_grid = {'n_estimators': [50, 100, 150],
              'max_depth': [None, 10, 20],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'max_features': ['sqrt', 'log2', None],
              'bootstrap': [True, False],
              'criterion': ['gini', 'entropy']}

    grid_searchrf = GridSearchCV(rf, param_grid, cv=5, scoring='balanced_accuracy')
    # NOTE: this grid search takes a long time to run.
    grid_searchrf.fit(X_train, y_train)
    final_rf = grid_searchrf.best_estimator_
    accuracyrf = final_rf.score(X_test, y_test)
    print(f"Model Accuracy on test set ={accuracyrf: .2%}")
    y_pred = final_rf.predict(X_test)

    # The whole fitted search object (not only the best estimator) is saved.
    with open('best_rf_model.pkl', 'wb') as file:
        pickle.dump(grid_searchrf, file)

    accuracyr = accuracy_score(y_test, y_pred)
    precisionr = precision_score(y_test, y_pred)
    recallr = recall_score(y_test, y_pred)
    balance = balanced_accuracy_score(y_test,y_pred)

    print("Accuracy:", accuracyr)
    # NOTE(review): the original author flagged precision and recall as
    # "not working"; the cause is not visible here — TODO confirm.
    print("Precision:", precisionr)
    print("Recall:", recallr)
    print("Balanced:", balance)
    print(confusion_matrix(y_test, y_pred))

# Build the prepared train/test splits once; the model functions called in the
# next cell take these four names as their arguments.
X_train, X_test, y_train, y_test = data_processing(dataset)





In [None]:
# Train, tune and report each classifier in turn on the prepared splits.
# Every call prints its own metrics and writes a 'best_*_model.pkl' file:
# svc saves the best estimator, the other four save the fitted grid search.
MLP(X_train, X_test, y_train, y_test)
logistic_regression(X_train, X_test, y_train, y_test)
RandomForest(X_train, X_test, y_train, y_test)
decision_Tree(X_train, X_test, y_train, y_test)
svc(X_train, X_test, y_train, y_test)
