Logistic Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every CSV / joblib artifact written by this cell.
model_name = 'logistic_regression'

# Training table first, then the two external tables that are scored further down.
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# Every column other than NID, time1 and the IIR_3revi target is a candidate feature.
X = data.drop(['NID', 'time1', 'IIR_3revi'], axis=1)
y = data.loc[:, 'IIR_3revi']

def select_features(X, y, model, n_features):
    """Choose ``n_features`` columns of ``X`` by recursive feature elimination.

    ``model`` is the estimator RFE refits while dropping one feature per
    round (``step=1``). Returns the labels of the surviving columns as a
    pandas Index taken from ``X.columns``.
    """
    rfe = RFE(estimator=model, n_features_to_select=n_features, step=1).fit(X, y)
    kept_positions = rfe.get_support(indices=True)
    return X.columns[kept_positions]

# --- Feature selection: RFE down to 8 columns, driven by a logistic regression.
rfe_estimator = LogisticRegression(max_iter=10000)
selected_features = select_features(X, y, rfe_estimator, 8)
X_selected = X[selected_features]

# --- Hyper-parameter search (10-fold CV, ranked by ROC AUC).
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
)
logistic_regression = LogisticRegression(max_iter=10000)
grid_search = GridSearchCV(
    estimator=logistic_regression,
    param_grid=param_grid,
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_selected, y)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_features_list = selected_features.tolist()

# --- Persist the winning configuration and the data slice it was built from.
params_frame = pd.DataFrame([best_params])
params_frame.to_csv(f'{model_name}_best_params.csv', index=False)
features_frame = pd.DataFrame({'Best Features': best_features_list})
features_frame.to_csv(f'{model_name}_best_features.csv', index=False)

best_model_data = data[['NID', 'time1', 'IIR_3revi', *best_features_list]]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# --- Metrics on X_selected, i.e. the same rows the search was fitted on.
y_pred = best_model.predict(X_selected)
y_prob = best_model.predict_proba(X_selected)[:, 1]

accuracy, precision, recall, f1 = (
    scorer(y, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y, y_prob)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # sensitivity is reported as a second column with the recall value

overall_metrics = dict(zip(
    ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity'],
    [accuracy, precision, recall, f1, auc, specificity, sensitivity],
))
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, best_features_list, best_model):
    """Evaluate ``best_model`` on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the ``time1`` column.
        data: DataFrame containing ``time1``, the ``IIR_3revi`` target and
            every column named in ``best_features_list``.
        best_features_list: Feature columns handed to the model.
        best_model: Fitted binary classifier providing ``predict``,
            ``predict_proba`` and ``classes_``.

    Returns:
        dict keyed by 'Time Point', 'Accuracy', 'Precision', 'Recall',
        'F1-Score', 'AUC', 'Specificity' and 'Sensitivity'. All metrics are
        NaN when no row passes the filter; 'AUC' is NaN when the subset holds
        a single class and 'Specificity' is NaN when it holds no negatives.
    """
    nan = float('nan')
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity']

    subset = data[data['time1'] > time_point]
    if subset.empty:
        # Nothing to score; calling predict() on zero rows would raise instead.
        return {'Time Point': time_point, **dict.fromkeys(metric_names, nan)}

    X_subset = subset[best_features_list]
    y_subset = subset['IIR_3revi']

    y_pred_subset = best_model.predict(X_subset)
    y_prob_subset = best_model.predict_proba(X_subset)[:, 1]

    accuracy = accuracy_score(y_subset, y_pred_subset)
    precision = precision_score(y_subset, y_pred_subset)
    recall = recall_score(y_subset, y_pred_subset)
    f1 = f1_score(y_subset, y_pred_subset)
    # ROC AUC needs both classes in the ground truth.
    auc = roc_auc_score(y_subset, y_prob_subset) if y_subset.nunique() > 1 else nan
    # Pin the label set so the matrix stays 2x2 even if a class is missing
    # from this subset; otherwise the 4-way unpack below fails.
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_subset, labels=best_model.classes_).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else nan
    sensitivity = recall

    return {
        'Time Point': time_point,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }

# time1 thresholds at which the model is re-scored on the training table.
time_points = [5, 6, 7]
time_jobs = (
    delayed(calculate_metrics_at_time)(cutoff, data, best_features_list, best_model)
    for cutoff in time_points
)
metrics_at_times = Parallel(n_jobs=8)(time_jobs)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, best_features_list, best_model, model_name):
    """Score ``best_model`` on ``external_data`` and write two CSV reports.

    Writes ``{model_name}_external_overall_metrics.csv`` (one row for the
    whole table) and ``{model_name}_external_metrics_at_times.csv`` (one row
    per entry of the module-level ``time_points``, produced by
    ``calculate_metrics_at_time``). Returns nothing.
    """
    features = external_data[best_features_list]
    truth = external_data['IIR_3revi']

    predicted = best_model.predict(features)
    positive_scores = best_model.predict_proba(features)[:, 1]

    report = {
        'Accuracy': accuracy_score(truth, predicted),
        'Precision': precision_score(truth, predicted),
        'Recall': recall_score(truth, predicted),
        'F1-Score': f1_score(truth, predicted),
        'AUC': roc_auc_score(truth, positive_scores),
    }
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(truth, predicted).ravel()
    report['Specificity'] = true_neg / (true_neg + false_pos)
    report['Sensitivity'] = report['Recall']
    pd.DataFrame([report]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    per_time_jobs = (
        delayed(calculate_metrics_at_time)(cutoff, external_data, best_features_list, best_model)
        for cutoff in time_points
    )
    per_time_rows = Parallel(n_jobs=8)(per_time_jobs)
    pd.DataFrame(per_time_rows).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Score both external tables; the suffix becomes part of each report's file name.
for cohort_suffix, cohort_frame in (('ditan', external_data1), ('youan', external_data2)):
    evaluate_external_data(cohort_frame, best_features_list, best_model, f'{model_name}_{cohort_suffix}')

# Persist the fitted estimator next to the reports.
joblib.dump(best_model, f'{model_name}_best_model.joblib')

Support Vector Machine

In [49]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every CSV / joblib artifact written by this cell.
model_name = 'svm'

# Training table followed by the two external tables.
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# NID, time1 and the IIR_3revi target are excluded from the feature matrix.
non_feature_columns = ['NID', 'time1', 'IIR_3revi']
X = data.drop(columns=non_feature_columns)
y = data['IIR_3revi']

def select_features(X, y, model, n_features):
    """Run recursive feature elimination and return the retained column labels.

    RFE refits ``model`` on ``X``/``y`` and removes one feature per iteration
    until ``n_features`` remain; the result is the matching slice of
    ``X.columns``.
    """
    eliminator = RFE(estimator=model, n_features_to_select=n_features, step=1)
    eliminator.fit(X, y)
    keep_mask = eliminator.get_support()
    return X.columns[keep_mask]

# Feature selection: RFE down to 8 columns using a linear-kernel SVC.
selected_features = select_features(X, y, SVC(kernel='linear'), 8)
X_selected = X[selected_features]

# gamma only affects the rbf kernel; crossing it with kernel='linear' refits
# the same linear model once per gamma value. Splitting the grid per kernel
# searches the same distinct models without the duplicate fits.
param_grid = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01, 0.001]},
]
svm = SVC(probability=True)  # probability=True enables predict_proba used below
grid_search = GridSearchCV(estimator=svm, param_grid=param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_selected, y)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_features_list = selected_features.tolist()

# Keep the C / gamma / kernel column layout of the report even when the linear
# kernel wins and best_params carries no gamma (the cell is then left empty).
pd.DataFrame([best_params], columns=['C', 'gamma', 'kernel']).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(best_features_list, columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

best_model_data = data[['NID', 'time1', 'IIR_3revi'] + best_features_list]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Metrics below are computed on X_selected, the same rows the search was fitted on.
y_pred = best_model.predict(X_selected)
y_prob = best_model.predict_proba(X_selected)[:, 1]

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
auc = roc_auc_score(y, y_prob)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
specificity = tn / (tn + fp)
sensitivity = recall

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, best_features_list, best_model):
    """Evaluate ``best_model`` on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the ``time1`` column.
        data: DataFrame containing ``time1``, the ``IIR_3revi`` target and
            every column named in ``best_features_list``.
        best_features_list: Feature columns handed to the model.
        best_model: Fitted binary classifier providing ``predict``,
            ``predict_proba`` and ``classes_``.

    Returns:
        dict keyed by 'Time Point', 'Accuracy', 'Precision', 'Recall',
        'F1-Score', 'AUC', 'Specificity' and 'Sensitivity'. All metrics are
        NaN when no row passes the filter; 'AUC' is NaN when the subset holds
        a single class and 'Specificity' is NaN when it holds no negatives.
    """
    nan = float('nan')
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity']

    subset = data[data['time1'] > time_point]
    if subset.empty:
        # Nothing to score; calling predict() on zero rows would raise instead.
        return {'Time Point': time_point, **dict.fromkeys(metric_names, nan)}

    X_subset = subset[best_features_list]
    y_subset = subset['IIR_3revi']

    y_pred_subset = best_model.predict(X_subset)
    y_prob_subset = best_model.predict_proba(X_subset)[:, 1]

    accuracy = accuracy_score(y_subset, y_pred_subset)
    precision = precision_score(y_subset, y_pred_subset)
    recall = recall_score(y_subset, y_pred_subset)
    f1 = f1_score(y_subset, y_pred_subset)
    # ROC AUC needs both classes in the ground truth.
    auc = roc_auc_score(y_subset, y_prob_subset) if y_subset.nunique() > 1 else nan
    # Pin the label set so the matrix stays 2x2 even if a class is missing
    # from this subset; otherwise the 4-way unpack below fails.
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_subset, labels=best_model.classes_).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else nan
    sensitivity = recall

    return {
        'Time Point': time_point,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }

# time1 thresholds at which the model is re-scored on the training table.
time_points = [5, 6, 7]
metrics_at_times = Parallel(n_jobs=8)(
    delayed(calculate_metrics_at_time)(cutoff, data, best_features_list, best_model)
    for cutoff in time_points
)
time_report = pd.DataFrame(metrics_at_times)
time_report.to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, best_features_list, best_model, model_name):
    """Write overall and per-time-point metric reports for one external table.

    The overall row goes to ``{model_name}_external_overall_metrics.csv``;
    one row per value in the module-level ``time_points`` (computed by
    ``calculate_metrics_at_time``) goes to
    ``{model_name}_external_metrics_at_times.csv``. Returns nothing.
    """
    inputs = external_data[best_features_list]
    y_true = external_data['IIR_3revi']

    y_hat = best_model.predict(inputs)
    y_score = best_model.predict_proba(inputs)[:, 1]

    # Label-based scorers, listed in the order of the report columns.
    label_scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    summary = {title: scorer(y_true, y_hat) for title, scorer in label_scorers}
    summary['AUC'] = roc_auc_score(y_true, y_score)

    cells = confusion_matrix(y_true, y_hat).ravel()
    negatives_right, negatives_wrong, _, _ = cells
    summary['Specificity'] = negatives_right / (negatives_right + negatives_wrong)
    summary['Sensitivity'] = summary['Recall']

    overall_frame = pd.DataFrame([summary])
    overall_frame.to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    rows_by_time = Parallel(n_jobs=8)(
        delayed(calculate_metrics_at_time)(cutoff, external_data, best_features_list, best_model)
        for cutoff in time_points
    )
    pd.DataFrame(rows_by_time).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Score both external tables; the suffix becomes part of each report's file name.
external_runs = [(external_data1, 'ditan'), (external_data2, 'youan')]
for cohort_frame, cohort_suffix in external_runs:
    evaluate_external_data(cohort_frame, best_features_list, best_model, f'{model_name}_{cohort_suffix}')

# Persist the fitted estimator next to the reports.
joblib.dump(best_model, f'{model_name}_best_model.joblib')

K-Nearest Neighbors

In [50]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every CSV / joblib artifact written by this cell.
model_name = 'knn'

# Training table first, then the two external tables that are scored further down.
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# Every column other than NID, time1 and the IIR_3revi target is a candidate feature.
X = data.drop(['NID', 'time1', 'IIR_3revi'], axis=1)
y = data.loc[:, 'IIR_3revi']

def select_features(X, y, model, n_features):
    """Choose ``n_features`` columns of ``X`` by recursive feature elimination.

    ``model`` is the estimator RFE refits while dropping one feature per
    round (``step=1``). Returns the labels of the surviving columns as a
    pandas Index taken from ``X.columns``.
    """
    rfe = RFE(estimator=model, n_features_to_select=n_features, step=1).fit(X, y)
    kept_positions = rfe.get_support(indices=True)
    return X.columns[kept_positions]

# --- Feature selection: a logistic regression stands in as the RFE ranking
# --- estimator, and the 8 surviving columns feed the KNN search below.
logistic_model = LogisticRegression(max_iter=1000)
selected_features = select_features(X, y, logistic_model, 8)
X_selected = X[selected_features]

# --- Hyper-parameter search (10-fold CV, ranked by ROC AUC).
param_grid = {
    'n_neighbors': list(range(3, 12, 2)),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski'],
}
knn = KNeighborsClassifier()
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_selected, y)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_features_list = selected_features.tolist()

# --- Persist the winning configuration and the data slice it was built from.
params_frame = pd.DataFrame([best_params])
params_frame.to_csv(f'{model_name}_best_params.csv', index=False)
features_frame = pd.DataFrame({'Best Features': best_features_list})
features_frame.to_csv(f'{model_name}_best_features.csv', index=False)

report_columns = ['NID', 'time1', 'IIR_3revi'] + best_features_list
best_model_data = data[report_columns]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# --- Metrics on X_selected, i.e. the same rows the search was fitted on.
y_pred = best_model.predict(X_selected)
y_prob = best_model.predict_proba(X_selected)[:, 1]

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
auc = roc_auc_score(y, y_prob)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # sensitivity is reported as a second column with the recall value

overall_metrics = dict(
    zip(
        ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity'),
        (accuracy, precision, recall, f1, auc, specificity, sensitivity),
    )
)
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, best_features_list, best_model):
    """Evaluate ``best_model`` on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the ``time1`` column.
        data: DataFrame containing ``time1``, the ``IIR_3revi`` target and
            every column named in ``best_features_list``.
        best_features_list: Feature columns handed to the model.
        best_model: Fitted binary classifier providing ``predict``,
            ``predict_proba`` and ``classes_``.

    Returns:
        dict keyed by 'Time Point', 'Accuracy', 'Precision', 'Recall',
        'F1-Score', 'AUC', 'Specificity' and 'Sensitivity'. All metrics are
        NaN when no row passes the filter; 'AUC' is NaN when the subset holds
        a single class and 'Specificity' is NaN when it holds no negatives.
    """
    nan = float('nan')
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity']

    subset = data[data['time1'] > time_point]
    if subset.empty:
        # Nothing to score; calling predict() on zero rows would raise instead.
        return {'Time Point': time_point, **dict.fromkeys(metric_names, nan)}

    X_subset = subset[best_features_list]
    y_subset = subset['IIR_3revi']

    y_pred_subset = best_model.predict(X_subset)
    y_prob_subset = best_model.predict_proba(X_subset)[:, 1]

    accuracy = accuracy_score(y_subset, y_pred_subset)
    precision = precision_score(y_subset, y_pred_subset)
    recall = recall_score(y_subset, y_pred_subset)
    f1 = f1_score(y_subset, y_pred_subset)
    # ROC AUC needs both classes in the ground truth.
    auc = roc_auc_score(y_subset, y_prob_subset) if y_subset.nunique() > 1 else nan
    # Pin the label set so the matrix stays 2x2 even if a class is missing
    # from this subset; otherwise the 4-way unpack below fails.
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_subset, labels=best_model.classes_).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else nan
    sensitivity = recall

    return {
        'Time Point': time_point,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }

# time1 thresholds at which the model is re-scored on the training table.
time_points = [5, 6, 7]
time_jobs = (
    delayed(calculate_metrics_at_time)(cutoff, data, best_features_list, best_model)
    for cutoff in time_points
)
metrics_at_times = Parallel(n_jobs=8)(time_jobs)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, best_features_list, best_model, model_name):
    """Evaluate ``best_model`` on an external table and save the results.

    Two files are produced: ``{model_name}_external_overall_metrics.csv``
    with a single row for the full table, and
    ``{model_name}_external_metrics_at_times.csv`` with one row per entry of
    the module-level ``time_points`` (via ``calculate_metrics_at_time``).
    Returns nothing.
    """
    ext_X = external_data[best_features_list]
    ext_y = external_data['IIR_3revi']

    ext_pred = best_model.predict(ext_X)
    ext_prob = best_model.predict_proba(ext_X)[:, 1]

    acc_value = accuracy_score(ext_y, ext_pred)
    prec_value = precision_score(ext_y, ext_pred)
    rec_value = recall_score(ext_y, ext_pred)
    f1_value = f1_score(ext_y, ext_pred)
    auc_value = roc_auc_score(ext_y, ext_prob)

    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(ext_y, ext_pred).ravel()
    spec_value = true_neg / (true_neg + false_pos)

    column_titles = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity']
    column_values = [acc_value, prec_value, rec_value, f1_value, auc_value, spec_value, rec_value]
    overall_row = dict(zip(column_titles, column_values))
    pd.DataFrame([overall_row]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    time_jobs = (
        delayed(calculate_metrics_at_time)(cutoff, external_data, best_features_list, best_model)
        for cutoff in time_points
    )
    time_rows = Parallel(n_jobs=8)(time_jobs)
    pd.DataFrame(time_rows).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Score both external tables; the suffix becomes part of each report's file name.
for cohort_suffix, cohort_frame in (('ditan', external_data1), ('youan', external_data2)):
    evaluate_external_data(cohort_frame, best_features_list, best_model, f'{model_name}_{cohort_suffix}')

# Persist the fitted estimator next to the reports.
joblib.dump(best_model, f'{model_name}_best_model.joblib')

Naive Bayes

In [None]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every CSV / joblib artifact written by this cell.
model_name = 'naive_bayes'

# Training table followed by the two external tables.
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# NID, time1 and the IIR_3revi target are excluded from the feature matrix.
non_feature_columns = ['NID', 'time1', 'IIR_3revi']
X = data.drop(columns=non_feature_columns)
y = data['IIR_3revi']

def select_features(X, y, model, n_features):
    """Run recursive feature elimination and return the retained column labels.

    RFE refits ``model`` on ``X``/``y`` and removes one feature per iteration
    until ``n_features`` remain; the result is the matching slice of
    ``X.columns``.
    """
    eliminator = RFE(estimator=model, n_features_to_select=n_features, step=1)
    eliminator.fit(X, y)
    keep_mask = eliminator.get_support()
    return X.columns[keep_mask]

# --- Feature selection: a logistic regression stands in as the RFE ranking
# --- estimator, and the 8 surviving columns feed the Naive Bayes search below.
logistic_model = LogisticRegression(max_iter=1000)
selected_features = select_features(X, y, logistic_model, 8)
X_selected = X[selected_features]

# --- Hyper-parameter search: 100 log-spaced var_smoothing values from 1 down to 1e-9.
param_grid = {'var_smoothing': np.logspace(0, -9, num=100)}
naive_bayes = GaussianNB()
grid_search = GridSearchCV(
    estimator=naive_bayes,
    param_grid=param_grid,
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_selected, y)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_features_list = selected_features.tolist()

# --- Persist the winning configuration and the data slice it was built from.
params_frame = pd.DataFrame([best_params])
params_frame.to_csv(f'{model_name}_best_params.csv', index=False)
features_frame = pd.DataFrame({'Best Features': best_features_list})
features_frame.to_csv(f'{model_name}_best_features.csv', index=False)

best_model_data = data[['NID', 'time1', 'IIR_3revi', *best_features_list]]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# --- Metrics on X_selected, i.e. the same rows the search was fitted on.
y_pred = best_model.predict(X_selected)
y_prob = best_model.predict_proba(X_selected)[:, 1]

accuracy, precision, recall, f1 = (
    scorer(y, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)
auc = roc_auc_score(y, y_prob)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # sensitivity is reported as a second column with the recall value

overall_metrics = dict(zip(
    ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity'],
    [accuracy, precision, recall, f1, auc, specificity, sensitivity],
))
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, best_features_list, best_model):
    """Evaluate ``best_model`` on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the ``time1`` column.
        data: DataFrame containing ``time1``, the ``IIR_3revi`` target and
            every column named in ``best_features_list``.
        best_features_list: Feature columns handed to the model.
        best_model: Fitted binary classifier providing ``predict``,
            ``predict_proba`` and ``classes_``.

    Returns:
        dict keyed by 'Time Point', 'Accuracy', 'Precision', 'Recall',
        'F1-Score', 'AUC', 'Specificity' and 'Sensitivity'. All metrics are
        NaN when no row passes the filter; 'AUC' is NaN when the subset holds
        a single class and 'Specificity' is NaN when it holds no negatives.
    """
    nan = float('nan')
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity']

    subset = data[data['time1'] > time_point]
    if subset.empty:
        # Nothing to score; calling predict() on zero rows would raise instead.
        return {'Time Point': time_point, **dict.fromkeys(metric_names, nan)}

    X_subset = subset[best_features_list]
    y_subset = subset['IIR_3revi']

    y_pred_subset = best_model.predict(X_subset)
    y_prob_subset = best_model.predict_proba(X_subset)[:, 1]

    accuracy = accuracy_score(y_subset, y_pred_subset)
    precision = precision_score(y_subset, y_pred_subset)
    recall = recall_score(y_subset, y_pred_subset)
    f1 = f1_score(y_subset, y_pred_subset)
    # ROC AUC needs both classes in the ground truth.
    auc = roc_auc_score(y_subset, y_prob_subset) if y_subset.nunique() > 1 else nan
    # Pin the label set so the matrix stays 2x2 even if a class is missing
    # from this subset; otherwise the 4-way unpack below fails.
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_subset, labels=best_model.classes_).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else nan
    sensitivity = recall

    return {
        'Time Point': time_point,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }

# time1 thresholds at which the model is re-scored on the training table.
time_points = [5, 6, 7]
metrics_at_times = Parallel(n_jobs=8)(
    delayed(calculate_metrics_at_time)(cutoff, data, best_features_list, best_model)
    for cutoff in time_points
)
time_report = pd.DataFrame(metrics_at_times)
time_report.to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, best_features_list, best_model, model_name):
    """Score ``best_model`` on ``external_data`` and write two CSV reports.

    Writes ``{model_name}_external_overall_metrics.csv`` (one row for the
    whole table) and ``{model_name}_external_metrics_at_times.csv`` (one row
    per entry of the module-level ``time_points``, produced by
    ``calculate_metrics_at_time``). Returns nothing.
    """
    features = external_data[best_features_list]
    truth = external_data['IIR_3revi']

    predicted = best_model.predict(features)
    positive_scores = best_model.predict_proba(features)[:, 1]

    report = {
        'Accuracy': accuracy_score(truth, predicted),
        'Precision': precision_score(truth, predicted),
        'Recall': recall_score(truth, predicted),
        'F1-Score': f1_score(truth, predicted),
        'AUC': roc_auc_score(truth, positive_scores),
    }
    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(truth, predicted).ravel()
    report['Specificity'] = true_neg / (true_neg + false_pos)
    report['Sensitivity'] = report['Recall']
    pd.DataFrame([report]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    per_time_jobs = (
        delayed(calculate_metrics_at_time)(cutoff, external_data, best_features_list, best_model)
        for cutoff in time_points
    )
    per_time_rows = Parallel(n_jobs=8)(per_time_jobs)
    pd.DataFrame(per_time_rows).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Score both external tables; the suffix becomes part of each report's file name.
external_runs = [(external_data1, 'ditan'), (external_data2, 'youan')]
for cohort_frame, cohort_suffix in external_runs:
    evaluate_external_data(cohort_frame, best_features_list, best_model, f'{model_name}_{cohort_suffix}')

# Persist the fitted estimator next to the reports.
joblib.dump(best_model, f'{model_name}_best_model.joblib')

Linear Discriminant Analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every CSV / joblib artifact written by this cell.
model_name = 'lda'

# Training table first, then the two external tables that are scored further down.
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# Every column other than NID, time1 and the IIR_3revi target is a candidate feature.
X = data.drop(['NID', 'time1', 'IIR_3revi'], axis=1)
y = data.loc[:, 'IIR_3revi']

def select_features(X, y, model, n_features):
    """Choose ``n_features`` columns of ``X`` by recursive feature elimination.

    ``model`` is the estimator RFE refits while dropping one feature per
    round (``step=1``). Returns the labels of the surviving columns as a
    pandas Index taken from ``X.columns``.
    """
    rfe = RFE(estimator=model, n_features_to_select=n_features, step=1).fit(X, y)
    kept_positions = rfe.get_support(indices=True)
    return X.columns[kept_positions]

# Feature selection: RFE down to 8 columns using a default LDA as the ranking estimator.
selected_features = select_features(X, y, LinearDiscriminantAnalysis(), 8)
X_selected = X[selected_features]

# Shrinkage is only implemented for the 'lsqr' and 'eigen' solvers. Crossing
# it with 'svd' makes every svd candidate except shrinkage=None fail to fit,
# so the grid is split: svd is tried once without shrinkage, and the full
# shrinkage range is searched only with the solvers that support it.
shrinkage_values = [None, 'auto'] + list(np.linspace(0, 1, 20))
param_grid = [
    {'solver': ['svd'], 'shrinkage': [None]},
    {'solver': ['lsqr', 'eigen'], 'shrinkage': shrinkage_values},
]
lda = LinearDiscriminantAnalysis()
grid_search = GridSearchCV(estimator=lda, param_grid=param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_selected, y)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_features_list = selected_features.tolist()

pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(best_features_list, columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

best_model_data = data[['NID', 'time1', 'IIR_3revi'] + best_features_list]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Metrics below are computed on X_selected, the same rows the search was fitted on.
y_pred = best_model.predict(X_selected)
y_prob = best_model.predict_proba(X_selected)[:, 1]

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
auc = roc_auc_score(y, y_prob)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
specificity = tn / (tn + fp)
sensitivity = recall

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, best_features_list, best_model):
    """Evaluate ``best_model`` on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the ``time1`` column.
        data: DataFrame containing ``time1``, the ``IIR_3revi`` target and
            every column named in ``best_features_list``.
        best_features_list: Feature columns handed to the model.
        best_model: Fitted binary classifier providing ``predict``,
            ``predict_proba`` and ``classes_``.

    Returns:
        dict keyed by 'Time Point', 'Accuracy', 'Precision', 'Recall',
        'F1-Score', 'AUC', 'Specificity' and 'Sensitivity'. All metrics are
        NaN when no row passes the filter; 'AUC' is NaN when the subset holds
        a single class and 'Specificity' is NaN when it holds no negatives.
    """
    nan = float('nan')
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity']

    subset = data[data['time1'] > time_point]
    if subset.empty:
        # Nothing to score; calling predict() on zero rows would raise instead.
        return {'Time Point': time_point, **dict.fromkeys(metric_names, nan)}

    X_subset = subset[best_features_list]
    y_subset = subset['IIR_3revi']

    y_pred_subset = best_model.predict(X_subset)
    y_prob_subset = best_model.predict_proba(X_subset)[:, 1]

    accuracy = accuracy_score(y_subset, y_pred_subset)
    precision = precision_score(y_subset, y_pred_subset)
    recall = recall_score(y_subset, y_pred_subset)
    f1 = f1_score(y_subset, y_pred_subset)
    # ROC AUC needs both classes in the ground truth.
    auc = roc_auc_score(y_subset, y_prob_subset) if y_subset.nunique() > 1 else nan
    # Pin the label set so the matrix stays 2x2 even if a class is missing
    # from this subset; otherwise the 4-way unpack below fails.
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_subset, labels=best_model.classes_).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else nan
    sensitivity = recall

    return {
        'Time Point': time_point,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }

# time1 thresholds at which the model is re-scored on the training table.
time_points = [5, 6, 7]
time_jobs = (
    delayed(calculate_metrics_at_time)(cutoff, data, best_features_list, best_model)
    for cutoff in time_points
)
metrics_at_times = Parallel(n_jobs=8)(time_jobs)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, best_features_list, best_model, model_name):
    """Write overall and per-time-point metric reports for one external table.

    The overall row goes to ``{model_name}_external_overall_metrics.csv``;
    one row per value in the module-level ``time_points`` (computed by
    ``calculate_metrics_at_time``) goes to
    ``{model_name}_external_metrics_at_times.csv``. Returns nothing.
    """
    inputs = external_data[best_features_list]
    y_true = external_data['IIR_3revi']

    y_hat = best_model.predict(inputs)
    y_score = best_model.predict_proba(inputs)[:, 1]

    # Label-based scorers, listed in the order of the report columns.
    label_scorers = (
        ('Accuracy', accuracy_score),
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1-Score', f1_score),
    )
    summary = {title: scorer(y_true, y_hat) for title, scorer in label_scorers}
    summary['AUC'] = roc_auc_score(y_true, y_score)

    cells = confusion_matrix(y_true, y_hat).ravel()
    negatives_right, negatives_wrong, _, _ = cells
    summary['Specificity'] = negatives_right / (negatives_right + negatives_wrong)
    summary['Sensitivity'] = summary['Recall']

    overall_frame = pd.DataFrame([summary])
    overall_frame.to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    rows_by_time = Parallel(n_jobs=8)(
        delayed(calculate_metrics_at_time)(cutoff, external_data, best_features_list, best_model)
        for cutoff in time_points
    )
    pd.DataFrame(rows_by_time).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Score both external tables; the suffix becomes part of each report's file name.
for cohort_suffix, cohort_frame in (('ditan', external_data1), ('youan', external_data2)):
    evaluate_external_data(cohort_frame, best_features_list, best_model, f'{model_name}_{cohort_suffix}')

# Persist the fitted estimator next to the reports.
joblib.dump(best_model, f'{model_name}_best_model.joblib')

Quadratic Discriminant Analysis

In [None]:
import pandas as pd
import numpy as np
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every CSV / joblib artifact written by this cell.
model_name = 'qda'

# Training table followed by the two external tables.
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# NID, time1 and the IIR_3revi target are excluded from the feature matrix.
non_feature_columns = ['NID', 'time1', 'IIR_3revi']
X = data.drop(columns=non_feature_columns)
y = data['IIR_3revi']

def select_features(X, y, model, n_features):
    """Run recursive feature elimination and return the retained column labels.

    RFE refits ``model`` on ``X``/``y`` and removes one feature per iteration
    until ``n_features`` remain; the result is the matching slice of
    ``X.columns``.
    """
    eliminator = RFE(estimator=model, n_features_to_select=n_features, step=1)
    eliminator.fit(X, y)
    keep_mask = eliminator.get_support()
    return X.columns[keep_mask]

# --- Feature selection: a logistic regression stands in as the RFE ranking
# --- estimator, and the 8 surviving columns feed the QDA search below.
logistic_model = LogisticRegression(max_iter=1000)
selected_features = select_features(X, y, logistic_model, 8)
X_selected = X[selected_features]

# --- Hyper-parameter search: 20 evenly spaced reg_param values across [0, 1].
param_grid = {'reg_param': np.linspace(0, 1, 20)}
qda = QuadraticDiscriminantAnalysis()
grid_search = GridSearchCV(
    estimator=qda,
    param_grid=param_grid,
    cv=10,
    scoring='roc_auc',
    n_jobs=-1,
)
grid_search.fit(X_selected, y)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_features_list = selected_features.tolist()

# --- Persist the winning configuration and the data slice it was built from.
params_frame = pd.DataFrame([best_params])
params_frame.to_csv(f'{model_name}_best_params.csv', index=False)
features_frame = pd.DataFrame({'Best Features': best_features_list})
features_frame.to_csv(f'{model_name}_best_features.csv', index=False)

report_columns = ['NID', 'time1', 'IIR_3revi'] + best_features_list
best_model_data = data[report_columns]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# --- Metrics on X_selected, i.e. the same rows the search was fitted on.
y_pred = best_model.predict(X_selected)
y_prob = best_model.predict_proba(X_selected)[:, 1]

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
auc = roc_auc_score(y, y_prob)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # sensitivity is reported as a second column with the recall value

overall_metrics = dict(
    zip(
        ('Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity'),
        (accuracy, precision, recall, f1, auc, specificity, sensitivity),
    )
)
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, best_features_list, best_model):
    """Evaluate ``best_model`` on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the ``time1`` column.
        data: DataFrame containing ``time1``, the ``IIR_3revi`` target and
            every column named in ``best_features_list``.
        best_features_list: Feature columns handed to the model.
        best_model: Fitted binary classifier providing ``predict``,
            ``predict_proba`` and ``classes_``.

    Returns:
        dict keyed by 'Time Point', 'Accuracy', 'Precision', 'Recall',
        'F1-Score', 'AUC', 'Specificity' and 'Sensitivity'. All metrics are
        NaN when no row passes the filter; 'AUC' is NaN when the subset holds
        a single class and 'Specificity' is NaN when it holds no negatives.
    """
    nan = float('nan')
    metric_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity']

    subset = data[data['time1'] > time_point]
    if subset.empty:
        # Nothing to score; calling predict() on zero rows would raise instead.
        return {'Time Point': time_point, **dict.fromkeys(metric_names, nan)}

    X_subset = subset[best_features_list]
    y_subset = subset['IIR_3revi']

    y_pred_subset = best_model.predict(X_subset)
    y_prob_subset = best_model.predict_proba(X_subset)[:, 1]

    accuracy = accuracy_score(y_subset, y_pred_subset)
    precision = precision_score(y_subset, y_pred_subset)
    recall = recall_score(y_subset, y_pred_subset)
    f1 = f1_score(y_subset, y_pred_subset)
    # ROC AUC needs both classes in the ground truth.
    auc = roc_auc_score(y_subset, y_prob_subset) if y_subset.nunique() > 1 else nan
    # Pin the label set so the matrix stays 2x2 even if a class is missing
    # from this subset; otherwise the 4-way unpack below fails.
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_subset, labels=best_model.classes_).ravel()
    specificity = tn / (tn + fp) if (tn + fp) > 0 else nan
    sensitivity = recall

    return {
        'Time Point': time_point,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }

# time1 thresholds at which the model is re-scored on the training table.
time_points = [5, 6, 7]
metrics_at_times = Parallel(n_jobs=8)(
    delayed(calculate_metrics_at_time)(cutoff, data, best_features_list, best_model)
    for cutoff in time_points
)
time_report = pd.DataFrame(metrics_at_times)
time_report.to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, best_features_list, best_model, model_name):
    """Evaluate ``best_model`` on an external table and save the results.

    Two files are produced: ``{model_name}_external_overall_metrics.csv``
    with a single row for the full table, and
    ``{model_name}_external_metrics_at_times.csv`` with one row per entry of
    the module-level ``time_points`` (via ``calculate_metrics_at_time``).
    Returns nothing.
    """
    ext_X = external_data[best_features_list]
    ext_y = external_data['IIR_3revi']

    ext_pred = best_model.predict(ext_X)
    ext_prob = best_model.predict_proba(ext_X)[:, 1]

    acc_value = accuracy_score(ext_y, ext_pred)
    prec_value = precision_score(ext_y, ext_pred)
    rec_value = recall_score(ext_y, ext_pred)
    f1_value = f1_score(ext_y, ext_pred)
    auc_value = roc_auc_score(ext_y, ext_prob)

    true_neg, false_pos, _false_neg, _true_pos = confusion_matrix(ext_y, ext_pred).ravel()
    spec_value = true_neg / (true_neg + false_pos)

    column_titles = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC', 'Specificity', 'Sensitivity']
    column_values = [acc_value, prec_value, rec_value, f1_value, auc_value, spec_value, rec_value]
    overall_row = dict(zip(column_titles, column_values))
    pd.DataFrame([overall_row]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    time_jobs = (
        delayed(calculate_metrics_at_time)(cutoff, external_data, best_features_list, best_model)
        for cutoff in time_points
    )
    time_rows = Parallel(n_jobs=8)(time_jobs)
    pd.DataFrame(time_rows).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Score both external tables; the suffix becomes part of each report's file name.
external_runs = [(external_data1, 'ditan'), (external_data2, 'youan')]
for cohort_frame, cohort_suffix in external_runs:
    evaluate_external_data(cohort_frame, best_features_list, best_model, f'{model_name}_{cohort_suffix}')

# Persist the fitted estimator next to the reports.
joblib.dump(best_model, f'{model_name}_best_model.joblib')

K-Means

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every CSV / joblib artifact written by this cell.
model_name = 'kmeans'

# Training table first, then the two external tables.
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# Every column other than NID, time1 and the IIR_3revi target is a candidate feature.
X = data.drop(['NID', 'time1', 'IIR_3revi'], axis=1)
y = data.loc[:, 'IIR_3revi']

def select_features(X, y, model, n_features):
    """Return the column labels kept by recursive feature elimination.

    RFE wraps ``model`` and removes one feature per iteration (``step=1``)
    until ``n_features`` remain; the result is the matching subset of
    ``X.columns``.
    """
    rfe = RFE(estimator=model, n_features_to_select=n_features, step=1).fit(X, y)
    kept_positions = rfe.get_support(indices=True)
    return X.columns[kept_positions]

# Pick 8 predictors with RFE driven by a logistic regression and keep only
# those columns for clustering.
logistic_model = LogisticRegression(max_iter=1000)
selected_features = select_features(X, y, logistic_model, 8)
X_selected = X[selected_features]

# KMeans hyper-parameter grid: 9 x 2 x 3 x 3 = 162 combinations, each
# evaluated with 10-fold cross-validation below.
param_grid = {
    'n_clusters': [2, 3, 4, 5, 6, 7, 8, 9, 10],
    'init': ['k-means++', 'random'],
    'n_init': [10, 20, 30],
    'max_iter': [300, 600, 900]
}
kmeans = KMeans()
# NOTE(review): the 'roc_auc' scorer needs decision_function or predict_proba
# and KMeans provides neither -- presumably every candidate scores NaN, so the
# "best" estimator is not actually chosen by AUC. Confirm on the installed
# scikit-learn version.
grid_search = GridSearchCV(estimator=kmeans, param_grid=param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_selected, y)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_features_list = selected_features.tolist()

# Persist the chosen hyper-parameters and the selected feature names.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(best_features_list, columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

# Save the modelling table: NID, time1, the target, and the selected features.
best_model_data = data[['NID', 'time1', 'IIR_3revi'] + best_features_list]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# NOTE(review): KMeans.predict returns cluster indices rather than IIR_3revi
# labels, and y_prob is the distance to the nearest cluster centre rather than
# a positive-class probability -- verify that the metrics below are meaningful
# (they also assume exactly two distinct predicted values).
y_pred = best_model.predict(X_selected)
y_prob = best_model.transform(X_selected).min(axis=1)

accuracy = accuracy_score(y, y_pred)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)
auc = roc_auc_score(y, y_prob)
# Unpacking into four values requires a 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # reported under both names

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, best_features_list, best_model):
    """Metrics of the fitted ``best_model`` on rows with ``time1`` above ``time_point``.

    Labels come from ``best_model.predict`` and the AUC score is the smallest
    value per row of ``best_model.transform``. Returns one dict keyed by
    metric name, with ``'Time Point'`` first.

    NOTE(review): ``predict`` yields cluster indices here -- confirm they line
    up with the ``IIR_3revi`` labels before trusting these numbers.
    """
    later_rows = data.loc[data['time1'] > time_point]
    features = later_rows[best_features_list]
    truth = later_rows['IIR_3revi']

    labels = best_model.predict(features)
    scores = best_model.transform(features).min(axis=1)

    report = {
        'Time Point': time_point,
        'Accuracy': accuracy_score(truth, labels),
        'Precision': precision_score(truth, labels),
        'Recall': recall_score(truth, labels),
        'F1-Score': f1_score(truth, labels),
        'AUC': roc_auc_score(truth, scores),
    }
    # The confusion matrix is computed last, as in the metric order above.
    true_neg, false_pos, _, _ = confusion_matrix(truth, labels).ravel()
    report['Specificity'] = true_neg / (true_neg + false_pos)
    report['Sensitivity'] = report['Recall']
    return report

# Thresholds on time1; one metrics row is computed per threshold in parallel.
time_points = [5, 6, 7]
metrics_at_times = Parallel(n_jobs=8)(
    delayed(calculate_metrics_at_time)(cutoff, data, best_features_list, best_model)
    for cutoff in time_points
)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, best_features_list, best_model, model_name):
    """Score the fitted ``best_model`` on ``external_data`` and write two CSVs.

    ``{model_name}_external_overall_metrics.csv`` holds one row for the whole
    table; ``{model_name}_external_metrics_at_times.csv`` holds one row per
    entry of the module-level ``time_points``.
    """
    features = external_data[best_features_list]
    truth = external_data['IIR_3revi']

    labels = best_model.predict(features)
    scores = best_model.transform(features).min(axis=1)

    summary = {
        'Accuracy': accuracy_score(truth, labels),
        'Precision': precision_score(truth, labels),
        'Recall': recall_score(truth, labels),
        'F1-Score': f1_score(truth, labels),
        'AUC': roc_auc_score(truth, scores),
    }
    true_neg, false_pos, _, _ = confusion_matrix(truth, labels).ravel()
    summary['Specificity'] = true_neg / (true_neg + false_pos)
    summary['Sensitivity'] = summary['Recall']
    pd.DataFrame([summary]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    per_time_rows = Parallel(n_jobs=8)(
        delayed(calculate_metrics_at_time)(cutoff, external_data, best_features_list, best_model)
        for cutoff in time_points
    )
    pd.DataFrame(per_time_rows).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Run the external evaluation once per external dataset (outputs tagged
# 'ditan' and 'youan'), then serialize the tuned model with joblib.
for external_frame, output_prefix in (
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
):
    evaluate_external_data(external_frame, best_features_list, best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

Linear Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

model_name = 'linear_regression'

# Read the main table and the two external tables.
data, external_data1, external_data2 = (
    pd.read_csv(csv_path) for csv_path in ('..', '..', '..')
)

# Target column first, then the predictors: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(columns=['NID', 'time1', 'IIR_3revi'])

def select_features(X, y, model, n_features):
    """Return the column labels kept by recursive feature elimination.

    RFE wraps ``model`` and removes one feature per iteration (``step=1``)
    until ``n_features`` remain; the result is the matching subset of
    ``X.columns``.
    """
    rfe = RFE(estimator=model, n_features_to_select=n_features, step=1).fit(X, y)
    kept_positions = rfe.get_support(indices=True)
    return X.columns[kept_positions]

# Pick 8 predictors with RFE driven by the linear model itself and keep only
# those columns.
linear_model = LinearRegression()
selected_features = select_features(X, y, linear_model, 8)
X_selected = X[selected_features]

# 2 x 2 grid over LinearRegression options, scored by ROC AUC with 10-fold CV.
param_grid = {
    'fit_intercept': [True, False],
    'copy_X': [True, False]
}
grid_search = GridSearchCV(estimator=linear_model, param_grid=param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_selected, y)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_features_list = selected_features.tolist()

# Persist the chosen hyper-parameters and the selected feature names.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(best_features_list, columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

# Save the modelling table: NID, time1, the target, and the selected features.
best_model_data = data[['NID', 'time1', 'IIR_3revi'] + best_features_list]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold continuous predictions (clones of best_model are refit per fold);
# values strictly above 0.5 are treated as the positive label.
y_pred = cross_val_predict(best_model, X_selected, y, cv=10)
y_pred_binary = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
# AUC uses the raw regression output, not the thresholded labels.
auc = roc_auc_score(y, y_pred)
# Unpacking into four values requires a 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # reported under both names

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, best_features_list, best_model):
    """Cross-validated metrics on the rows whose ``time1`` exceeds ``time_point``.

    ``cross_val_predict`` (10 folds) produces the continuous predictions for
    the subset; they are cut at 0.5 for the label-based metrics and used
    as-is for AUC. Returns one dict keyed by metric name, with
    ``'Time Point'`` first.
    """
    later_rows = data.loc[data['time1'] > time_point]
    features = later_rows[best_features_list]
    truth = later_rows['IIR_3revi']

    scores = cross_val_predict(best_model, features, truth, cv=10)
    labels = (scores > 0.5).astype(int)

    report = {
        'Time Point': time_point,
        'Accuracy': accuracy_score(truth, labels),
        'Precision': precision_score(truth, labels),
        'Recall': recall_score(truth, labels),
        'F1-Score': f1_score(truth, labels),
        'AUC': roc_auc_score(truth, scores),
    }
    # The confusion matrix is computed last, as in the metric order above.
    true_neg, false_pos, _, _ = confusion_matrix(truth, labels).ravel()
    report['Specificity'] = true_neg / (true_neg + false_pos)
    report['Sensitivity'] = report['Recall']
    return report

# Thresholds on time1; one metrics row is computed per threshold in parallel.
time_points = [5, 6, 7]
metrics_at_times = Parallel(n_jobs=8)(
    delayed(calculate_metrics_at_time)(cutoff, data, best_features_list, best_model)
    for cutoff in time_points
)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def _summarize_binary_predictions(y_true, y_score, threshold=0.5):
    """Build the metric dict shared by the overall and per-time-point reports.

    ``y_score`` is cut at ``threshold`` (strictly greater means positive) for
    the label-based metrics; AUC is computed from the raw scores.
    """
    y_label = (y_score > threshold).astype(int)
    recall = recall_score(y_true, y_label)
    # Unpacking into four values requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_true, y_label).ravel()
    return {
        'Accuracy': accuracy_score(y_true, y_label),
        'Precision': precision_score(y_true, y_label),
        'Recall': recall,
        'F1-Score': f1_score(y_true, y_label),
        'AUC': roc_auc_score(y_true, y_score),
        'Specificity': tn / (tn + fp),
        'Sensitivity': recall,
    }


def evaluate_external_data(external_data, best_features_list, best_model, model_name):
    """Score the already-fitted ``best_model`` on an external dataset.

    Writes ``{model_name}_external_overall_metrics.csv`` (one row) and
    ``{model_name}_external_metrics_at_times.csv`` (one row per entry of the
    module-level ``time_points``, restricted to rows with ``time1`` above it).

    Both reports use predictions from ``best_model`` itself. The
    per-time-point report used to go through ``calculate_metrics_at_time``,
    whose ``cross_val_predict`` call refits clones on the external rows, so it
    did not measure the trained model.
    """
    X_external = external_data[best_features_list]
    y_external = external_data['IIR_3revi']

    # Predict once; the per-time-point subsets reuse the same row-wise scores.
    y_pred_external = best_model.predict(X_external)

    overall_metrics_external = _summarize_binary_predictions(y_external, y_pred_external)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    metrics_at_times_external = []
    for time_point in time_points:
        later = (external_data['time1'] > time_point).to_numpy()
        row = {'Time Point': time_point}
        row.update(_summarize_binary_predictions(y_external[later], y_pred_external[later]))
        metrics_at_times_external.append(row)
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Run the external evaluation once per external dataset (outputs tagged
# 'ditan' and 'youan'), then serialize the tuned model with joblib.
for external_frame, output_prefix in (
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
):
    evaluate_external_data(external_frame, best_features_list, best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

Ridge Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

model_name = 'ridge_regression'

# Read the main table and the two external tables.
data, external_data1, external_data2 = (
    pd.read_csv(csv_path) for csv_path in ('..', '..', '..')
)

# Target column first, then the predictors: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(columns=['NID', 'time1', 'IIR_3revi'])

def select_features(X, y, model, n_features):
    """Return the column labels kept by recursive feature elimination.

    RFE wraps ``model`` and removes one feature per iteration (``step=1``)
    until ``n_features`` remain; the result is the matching subset of
    ``X.columns``.
    """
    rfe = RFE(estimator=model, n_features_to_select=n_features, step=1).fit(X, y)
    kept_positions = rfe.get_support(indices=True)
    return X.columns[kept_positions]

# Pick 8 predictors with RFE driven by the ridge model itself and keep only
# those columns.
ridge_model = Ridge()
selected_features = select_features(X, y, ridge_model, 8)
X_selected = X[selected_features]

# Four regularisation strengths, scored by ROC AUC with 10-fold CV.
param_grid = {
    'alpha': [0.1, 1.0, 10.0, 100.0]
}
grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, cv=10, scoring='roc_auc', n_jobs=-1)
grid_search.fit(X_selected, y)

best_params = grid_search.best_params_
best_model = grid_search.best_estimator_
best_features_list = selected_features.tolist()

# Persist the chosen hyper-parameters and the selected feature names.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(best_features_list, columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

# Save the modelling table: NID, time1, the target, and the selected features.
best_model_data = data[['NID', 'time1', 'IIR_3revi'] + best_features_list]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold continuous predictions (clones of best_model are refit per fold);
# values strictly above 0.5 are treated as the positive label.
y_pred = cross_val_predict(best_model, X_selected, y, cv=10)
y_pred_binary = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
# AUC uses the raw regression output, not the thresholded labels.
auc = roc_auc_score(y, y_pred)
# Unpacking into four values requires a 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # reported under both names

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, best_features_list, best_model):
    """Cross-validated metrics on the rows whose ``time1`` exceeds ``time_point``.

    ``cross_val_predict`` (10 folds) produces the continuous predictions for
    the subset; they are cut at 0.5 for the label-based metrics and used
    as-is for AUC. Returns one dict keyed by metric name, with
    ``'Time Point'`` first.
    """
    later_rows = data.loc[data['time1'] > time_point]
    features = later_rows[best_features_list]
    truth = later_rows['IIR_3revi']

    scores = cross_val_predict(best_model, features, truth, cv=10)
    labels = (scores > 0.5).astype(int)

    report = {
        'Time Point': time_point,
        'Accuracy': accuracy_score(truth, labels),
        'Precision': precision_score(truth, labels),
        'Recall': recall_score(truth, labels),
        'F1-Score': f1_score(truth, labels),
        'AUC': roc_auc_score(truth, scores),
    }
    # The confusion matrix is computed last, as in the metric order above.
    true_neg, false_pos, _, _ = confusion_matrix(truth, labels).ravel()
    report['Specificity'] = true_neg / (true_neg + false_pos)
    report['Sensitivity'] = report['Recall']
    return report

# Thresholds on time1; one metrics row is computed per threshold in parallel.
time_points = [5, 6, 7]
metrics_at_times = Parallel(n_jobs=8)(
    delayed(calculate_metrics_at_time)(cutoff, data, best_features_list, best_model)
    for cutoff in time_points
)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def _summarize_binary_predictions(y_true, y_score, threshold=0.5):
    """Build the metric dict shared by the overall and per-time-point reports.

    ``y_score`` is cut at ``threshold`` (strictly greater means positive) for
    the label-based metrics; AUC is computed from the raw scores.
    """
    y_label = (y_score > threshold).astype(int)
    recall = recall_score(y_true, y_label)
    # Unpacking into four values requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_true, y_label).ravel()
    return {
        'Accuracy': accuracy_score(y_true, y_label),
        'Precision': precision_score(y_true, y_label),
        'Recall': recall,
        'F1-Score': f1_score(y_true, y_label),
        'AUC': roc_auc_score(y_true, y_score),
        'Specificity': tn / (tn + fp),
        'Sensitivity': recall,
    }


def evaluate_external_data(external_data, best_features_list, best_model, model_name):
    """Score the already-fitted ``best_model`` on an external dataset.

    Writes ``{model_name}_external_overall_metrics.csv`` (one row) and
    ``{model_name}_external_metrics_at_times.csv`` (one row per entry of the
    module-level ``time_points``, restricted to rows with ``time1`` above it).

    Both reports use predictions from ``best_model`` itself. The
    per-time-point report used to go through ``calculate_metrics_at_time``,
    whose ``cross_val_predict`` call refits clones on the external rows, so it
    did not measure the trained model.
    """
    X_external = external_data[best_features_list]
    y_external = external_data['IIR_3revi']

    # Predict once; the per-time-point subsets reuse the same row-wise scores.
    y_pred_external = best_model.predict(X_external)

    overall_metrics_external = _summarize_binary_predictions(y_external, y_pred_external)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    metrics_at_times_external = []
    for time_point in time_points:
        later = (external_data['time1'] > time_point).to_numpy()
        row = {'Time Point': time_point}
        row.update(_summarize_binary_predictions(y_external[later], y_pred_external[later]))
        metrics_at_times_external.append(row)
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Run the external evaluation once per external dataset (outputs tagged
# 'ditan' and 'youan'), then serialize the tuned model with joblib.
for external_frame, output_prefix in (
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
):
    evaluate_external_data(external_frame, best_features_list, best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

Multilayer Perceptron

In [None]:
import pandas as pd
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

model_name = 'mlp_classifier'

# Read the main table and the two external tables.
data, external_data1, external_data2 = (
    pd.read_csv(csv_path) for csv_path in ('..', '..', '..')
)

# Target column first, then the predictors: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(columns=['NID', 'time1', 'IIR_3revi'])

def select_features(X, y, model, n_features):
    """Return the column labels kept by recursive feature elimination.

    RFE wraps ``model`` and removes one feature per iteration (``step=1``)
    until ``n_features`` remain; the result is the matching subset of
    ``X.columns``.
    """
    rfe = RFE(estimator=model, n_features_to_select=n_features, step=1).fit(X, y)
    kept_positions = rfe.get_support(indices=True)
    return X.columns[kept_positions]

# Pick 8 predictors with RFE driven by a logistic regression and keep only
# those columns for the network.
logistic_model = LogisticRegression(max_iter=1000)
selected_features = select_features(X, y, logistic_model, 8)
X_selected = X[selected_features]

# 3 x 2 x 2 x 2 x 2 = 48 MLP configurations, each evaluated with 10-fold CV.
param_grid = {
    'hidden_layer_sizes': [(50,50,50), (50,100,50), (100,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}

mlp_model = MLPClassifier(max_iter=1000)
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score, whereas the regression cells of this notebook
# rank by 'roc_auc' -- confirm which criterion is intended.
grid_search = GridSearchCV(estimator=mlp_model, param_grid=param_grid, cv=10, n_jobs=8)
grid_search.fit(X_selected, y)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Persist the chosen hyper-parameters and the selected feature names.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(list(selected_features), columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

# Save the modelling table: NID, time1, the target, and the selected features.
best_model_data = data[['NID', 'time1', 'IIR_3revi'] + list(selected_features)]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold positive-class probabilities. The default cross_val_predict
# method is `predict`, which returns hard labels and would reduce the AUC to
# a single operating point, so predict_proba is requested explicitly.
# Assumes the positive class is the second predict_proba column -- TODO confirm.
y_prob = cross_val_predict(best_model, X_selected, y, cv=10, method='predict_proba')[:, 1]
y_pred_binary = (y_prob > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
auc = roc_auc_score(y, y_prob)
# Unpacking into four values requires a 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # reported under both names

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, selected_features, model):
    """Cross-validated metrics on the rows whose ``time1`` exceeds ``time_point``.

    Clones of ``model`` are refit per fold (10 folds) on the subset. The
    out-of-fold positive-class probabilities drive AUC and, cut at 0.5, the
    label-based metrics. Previously AUC was computed from hard labels because
    ``cross_val_predict`` defaults to ``predict``.

    Returns one dict keyed by metric name, with ``'Time Point'`` first.
    """
    subset = data[data['time1'] > time_point]
    X_subset = subset[selected_features]
    y_subset = subset['IIR_3revi']

    # Assumes the positive class is the second predict_proba column -- TODO confirm.
    y_prob_subset = cross_val_predict(model, X_subset, y_subset, cv=10, method='predict_proba')[:, 1]
    y_pred_binary_subset = (y_prob_subset > 0.5).astype(int)

    accuracy = accuracy_score(y_subset, y_pred_binary_subset)
    precision = precision_score(y_subset, y_pred_binary_subset)
    recall = recall_score(y_subset, y_pred_binary_subset)
    f1 = f1_score(y_subset, y_pred_binary_subset)
    auc = roc_auc_score(y_subset, y_prob_subset)
    # Unpacking into four values requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_binary_subset).ravel()
    specificity = tn / (tn + fp)
    sensitivity = recall

    return {
        'Time Point': time_point,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }

# Thresholds on time1; one metrics row is computed per threshold in parallel.
time_points = [5, 6, 7]
metrics_at_times = Parallel(n_jobs=8)(
    delayed(calculate_metrics_at_time)(cutoff, data, list(selected_features), best_model)
    for cutoff in time_points
)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def _summarize_binary_predictions(y_true, y_score, threshold=0.5):
    """Build the metric dict shared by the overall and per-time-point reports.

    ``y_score`` is cut at ``threshold`` (strictly greater means positive) for
    the label-based metrics; AUC is computed from the raw scores.
    """
    y_label = (y_score > threshold).astype(int)
    recall = recall_score(y_true, y_label)
    # Unpacking into four values requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_true, y_label).ravel()
    return {
        'Accuracy': accuracy_score(y_true, y_label),
        'Precision': precision_score(y_true, y_label),
        'Recall': recall,
        'F1-Score': f1_score(y_true, y_label),
        'AUC': roc_auc_score(y_true, y_score),
        'Specificity': tn / (tn + fp),
        'Sensitivity': recall,
    }


def evaluate_external_data(external_data, selected_features, model, model_name):
    """Score the already-fitted ``model`` on an external dataset.

    Writes ``{model_name}_external_overall_metrics.csv`` (one row) and
    ``{model_name}_external_metrics_at_times.csv`` (one row per entry of the
    module-level ``time_points``, restricted to rows with ``time1`` above it).

    The previous version called ``cross_val_predict``, which refits clones of
    ``model`` on the external rows, so the trained model was never evaluated.
    AUC was also computed from hard labels. Both reports now use the fitted
    model's positive-class probabilities.
    """
    X_external = external_data[selected_features]
    y_external = external_data['IIR_3revi']

    # Predict once; the per-time-point subsets reuse the same row-wise scores.
    # Assumes the positive class is the second predict_proba column -- TODO confirm.
    y_prob_external = model.predict_proba(X_external)[:, 1]

    overall_metrics_external = _summarize_binary_predictions(y_external, y_prob_external)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    metrics_at_times_external = []
    for time_point in time_points:
        later = (external_data['time1'] > time_point).to_numpy()
        row = {'Time Point': time_point}
        row.update(_summarize_binary_predictions(y_external[later], y_prob_external[later]))
        metrics_at_times_external.append(row)
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Run the external evaluation once per external dataset (outputs tagged
# 'ditan' and 'youan'), then serialize the tuned model with joblib.
for external_frame, output_prefix in (
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
):
    evaluate_external_data(external_frame, list(selected_features), best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

Decision Tree

In [None]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

model_name = 'decision_tree_classifier'

# Read the main table and the two external tables.
data, external_data1, external_data2 = (
    pd.read_csv(csv_path) for csv_path in ('..', '..', '..')
)

# Target column first, then the predictors: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(columns=['NID', 'time1', 'IIR_3revi'])

def select_features(X, y, model, n_features):
    """Return the labels of the ``n_features`` most important columns of ``X``.

    ``model`` is fitted in place on ``(X, y)`` and its feature importances
    rank the columns.

    ``threshold=-np.inf`` disables SelectFromModel's default importance
    cut-off. With the default, ``max_features`` is only an upper bound and
    fewer than ``n_features`` columns can be returned.
    """
    model.fit(X, y)
    selector = SelectFromModel(model, threshold=-np.inf, max_features=n_features, prefit=True)
    return X.columns[selector.get_support(indices=True)]

# Rank predictors by the tree's feature importances and keep the selected columns.
decision_tree_model = DecisionTreeClassifier()
selected_features = select_features(X, y, decision_tree_model, 8)
X_selected = X[selected_features]

# 2 x 2 x 6 x 3 x 3 = 216 tree configurations, each evaluated with 10-fold CV.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score, whereas the regression cells of this notebook
# rank by 'roc_auc' -- confirm which criterion is intended.
grid_search = GridSearchCV(estimator=decision_tree_model, param_grid=param_grid, cv=10, n_jobs=8)
grid_search.fit(X_selected, y)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Persist the chosen hyper-parameters and the selected feature names.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(list(selected_features), columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

# Save the modelling table: NID, time1, the target, and the selected features.
best_model_data = data[['NID', 'time1', 'IIR_3revi'] + list(selected_features)]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold positive-class probabilities. The default cross_val_predict
# method is `predict`, which returns hard labels and would reduce the AUC to
# a single operating point, so predict_proba is requested explicitly.
# Assumes the positive class is the second predict_proba column -- TODO confirm.
y_prob = cross_val_predict(best_model, X_selected, y, cv=10, method='predict_proba')[:, 1]
y_pred_binary = (y_prob > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
auc = roc_auc_score(y, y_prob)
# Unpacking into four values requires a 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # reported under both names

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, selected_features, model):
    """Cross-validated metrics on the rows whose ``time1`` exceeds ``time_point``.

    Clones of ``model`` are refit per fold (10 folds) on the subset. The
    out-of-fold positive-class probabilities drive AUC and, cut at 0.5, the
    label-based metrics. Previously AUC was computed from hard labels because
    ``cross_val_predict`` defaults to ``predict``.

    Returns one dict keyed by metric name, with ``'Time Point'`` first.
    """
    subset = data[data['time1'] > time_point]
    X_subset = subset[selected_features]
    y_subset = subset['IIR_3revi']

    # Assumes the positive class is the second predict_proba column -- TODO confirm.
    y_prob_subset = cross_val_predict(model, X_subset, y_subset, cv=10, method='predict_proba')[:, 1]
    y_pred_binary_subset = (y_prob_subset > 0.5).astype(int)

    accuracy = accuracy_score(y_subset, y_pred_binary_subset)
    precision = precision_score(y_subset, y_pred_binary_subset)
    recall = recall_score(y_subset, y_pred_binary_subset)
    f1 = f1_score(y_subset, y_pred_binary_subset)
    auc = roc_auc_score(y_subset, y_prob_subset)
    # Unpacking into four values requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_binary_subset).ravel()
    specificity = tn / (tn + fp)
    sensitivity = recall

    return {
        'Time Point': time_point,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }

# Thresholds on time1; one metrics row is computed per threshold in parallel.
time_points = [5, 6, 7]
metrics_at_times = Parallel(n_jobs=8)(
    delayed(calculate_metrics_at_time)(cutoff, data, list(selected_features), best_model)
    for cutoff in time_points
)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def _summarize_binary_predictions(y_true, y_score, threshold=0.5):
    """Build the metric dict shared by the overall and per-time-point reports.

    ``y_score`` is cut at ``threshold`` (strictly greater means positive) for
    the label-based metrics; AUC is computed from the raw scores.
    """
    y_label = (y_score > threshold).astype(int)
    recall = recall_score(y_true, y_label)
    # Unpacking into four values requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_true, y_label).ravel()
    return {
        'Accuracy': accuracy_score(y_true, y_label),
        'Precision': precision_score(y_true, y_label),
        'Recall': recall,
        'F1-Score': f1_score(y_true, y_label),
        'AUC': roc_auc_score(y_true, y_score),
        'Specificity': tn / (tn + fp),
        'Sensitivity': recall,
    }


def evaluate_external_data(external_data, selected_features, model, model_name):
    """Score the already-fitted ``model`` on an external dataset.

    Writes ``{model_name}_external_overall_metrics.csv`` (one row) and
    ``{model_name}_external_metrics_at_times.csv`` (one row per entry of the
    module-level ``time_points``, restricted to rows with ``time1`` above it).

    The previous version called ``cross_val_predict``, which refits clones of
    ``model`` on the external rows, so the trained model was never evaluated.
    AUC was also computed from hard labels. Both reports now use the fitted
    model's positive-class probabilities.
    """
    X_external = external_data[selected_features]
    y_external = external_data['IIR_3revi']

    # Predict once; the per-time-point subsets reuse the same row-wise scores.
    # Assumes the positive class is the second predict_proba column -- TODO confirm.
    y_prob_external = model.predict_proba(X_external)[:, 1]

    overall_metrics_external = _summarize_binary_predictions(y_external, y_prob_external)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    metrics_at_times_external = []
    for time_point in time_points:
        later = (external_data['time1'] > time_point).to_numpy()
        row = {'Time Point': time_point}
        row.update(_summarize_binary_predictions(y_external[later], y_prob_external[later]))
        metrics_at_times_external.append(row)
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Run the external evaluation once per external dataset (outputs tagged
# 'ditan' and 'youan'), then serialize the tuned model with joblib.
for external_frame, output_prefix in (
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
):
    evaluate_external_data(external_frame, list(selected_features), best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

Random Forest

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

model_name = 'random_forest_classifier'

# Read the main table and the two external tables.
data, external_data1, external_data2 = (
    pd.read_csv(csv_path) for csv_path in ('..', '..', '..')
)

# Target column first, then the predictors: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(columns=['NID', 'time1', 'IIR_3revi'])

def select_features(X, y, model, n_features):
    """Return the labels of the ``n_features`` most important columns of ``X``.

    ``model`` is fitted in place on ``(X, y)`` and its feature importances
    rank the columns.

    ``threshold=-np.inf`` disables SelectFromModel's default importance
    cut-off. With the default, ``max_features`` is only an upper bound and
    fewer than ``n_features`` columns can be returned.
    """
    model.fit(X, y)
    selector = SelectFromModel(model, threshold=-np.inf, max_features=n_features, prefit=True)
    return X.columns[selector.get_support(indices=True)]

# Rank predictors by the forest's feature importances and keep the selected columns.
random_forest_model = RandomForestClassifier()
selected_features = select_features(X, y, random_forest_model, 8)
X_selected = X[selected_features]

# 'auto' was dropped from max_features: for classifiers it was an alias of
# 'sqrt' (a duplicate candidate) and recent scikit-learn releases reject it,
# which made a third of the grid fail to fit.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score, whereas the regression cells of this notebook
# rank by 'roc_auc' -- confirm which criterion is intended.
grid_search = GridSearchCV(estimator=random_forest_model, param_grid=param_grid, cv=10, n_jobs=8)
grid_search.fit(X_selected, y)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Persist the chosen hyper-parameters and the selected feature names.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(list(selected_features), columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

# Save the modelling table: NID, time1, the target, and the selected features.
best_model_data = data[['NID', 'time1', 'IIR_3revi'] + list(selected_features)]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold positive-class probabilities. The default cross_val_predict
# method is `predict`, which returns hard labels and would reduce the AUC to
# a single operating point, so predict_proba is requested explicitly.
# Assumes the positive class is the second predict_proba column -- TODO confirm.
y_prob = cross_val_predict(best_model, X_selected, y, cv=10, method='predict_proba')[:, 1]
y_pred_binary = (y_prob > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
auc = roc_auc_score(y, y_prob)
# Unpacking into four values requires a 2x2 confusion matrix.
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # reported under both names

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, selected_features, model):
    """Cross-validated metrics on the rows whose ``time1`` exceeds ``time_point``.

    Clones of ``model`` are refit per fold (10 folds) on the subset. The
    out-of-fold positive-class probabilities drive AUC and, cut at 0.5, the
    label-based metrics. Previously AUC was computed from hard labels because
    ``cross_val_predict`` defaults to ``predict``.

    Returns one dict keyed by metric name, with ``'Time Point'`` first.
    """
    subset = data[data['time1'] > time_point]
    X_subset = subset[selected_features]
    y_subset = subset['IIR_3revi']

    # Assumes the positive class is the second predict_proba column -- TODO confirm.
    y_prob_subset = cross_val_predict(model, X_subset, y_subset, cv=10, method='predict_proba')[:, 1]
    y_pred_binary_subset = (y_prob_subset > 0.5).astype(int)

    accuracy = accuracy_score(y_subset, y_pred_binary_subset)
    precision = precision_score(y_subset, y_pred_binary_subset)
    recall = recall_score(y_subset, y_pred_binary_subset)
    f1 = f1_score(y_subset, y_pred_binary_subset)
    auc = roc_auc_score(y_subset, y_prob_subset)
    # Unpacking into four values requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_binary_subset).ravel()
    specificity = tn / (tn + fp)
    sensitivity = recall

    return {
        'Time Point': time_point,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }

# Thresholds on time1; one metrics row is computed per threshold in parallel.
time_points = [5, 6, 7]
metrics_at_times = Parallel(n_jobs=8)(
    delayed(calculate_metrics_at_time)(cutoff, data, list(selected_features), best_model)
    for cutoff in time_points
)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def _summarize_binary_predictions(y_true, y_score, threshold=0.5):
    """Build the metric dict shared by the overall and per-time-point reports.

    ``y_score`` is cut at ``threshold`` (strictly greater means positive) for
    the label-based metrics; AUC is computed from the raw scores.
    """
    y_label = (y_score > threshold).astype(int)
    recall = recall_score(y_true, y_label)
    # Unpacking into four values requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_true, y_label).ravel()
    return {
        'Accuracy': accuracy_score(y_true, y_label),
        'Precision': precision_score(y_true, y_label),
        'Recall': recall,
        'F1-Score': f1_score(y_true, y_label),
        'AUC': roc_auc_score(y_true, y_score),
        'Specificity': tn / (tn + fp),
        'Sensitivity': recall,
    }


def evaluate_external_data(external_data, selected_features, model, model_name):
    """Score the already-fitted ``model`` on an external dataset.

    Writes ``{model_name}_external_overall_metrics.csv`` (one row) and
    ``{model_name}_external_metrics_at_times.csv`` (one row per entry of the
    module-level ``time_points``, restricted to rows with ``time1`` above it).

    The previous version called ``cross_val_predict``, which refits clones of
    ``model`` on the external rows, so the trained model was never evaluated.
    AUC was also computed from hard labels. Both reports now use the fitted
    model's positive-class probabilities.
    """
    X_external = external_data[selected_features]
    y_external = external_data['IIR_3revi']

    # Predict once; the per-time-point subsets reuse the same row-wise scores.
    # Assumes the positive class is the second predict_proba column -- TODO confirm.
    y_prob_external = model.predict_proba(X_external)[:, 1]

    overall_metrics_external = _summarize_binary_predictions(y_external, y_prob_external)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    metrics_at_times_external = []
    for time_point in time_points:
        later = (external_data['time1'] > time_point).to_numpy()
        row = {'Time Point': time_point}
        row.update(_summarize_binary_predictions(y_external[later], y_prob_external[later]))
        metrics_at_times_external.append(row)
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Run the external evaluation once per external dataset (outputs tagged
# 'ditan' and 'youan'), then serialize the tuned model with joblib.
for external_frame, output_prefix in (
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
):
    evaluate_external_data(external_frame, list(selected_features), best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

Gradient Boosting

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

model_name = 'gradient_boosting_classifier'

# Read the main table and the two external tables.
data, external_data1, external_data2 = (
    pd.read_csv(csv_path) for csv_path in ('..', '..', '..')
)

# Target column first, then the predictors: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(columns=['NID', 'time1', 'IIR_3revi'])

def select_features(X, y, model, n_features):
    """Fit ``model`` and return the names of its ``n_features`` top-ranked columns.

    Args:
        X: Feature DataFrame; the returned labels come from ``X.columns``.
        y: Target aligned with ``X``.
        model: Estimator exposing feature importances after ``fit``; it is
            fitted in place on the full ``X``/``y``.
        n_features: Number of columns to keep.

    Returns:
        pandas Index of the selected column labels, in ``X`` column order.
    """
    model.fit(X, y)
    # max_features alone is only an upper bound: SelectFromModel's default
    # importance threshold would still discard features and could return
    # fewer than n_features. threshold=-np.inf disables that cut-off so the
    # ranking by importance decides on its own.
    selector = SelectFromModel(model, threshold=-np.inf, max_features=n_features, prefit=True)
    return X.columns[selector.get_support(indices=True)]

# Rank features with a default model and keep the selected columns.
gradient_boosting_model = GradientBoostingClassifier()
selected_features = select_features(X, y, gradient_boosting_model, 8)
X_selected = X[selected_features]

# Hyper-parameter grid explored exhaustively with 10-fold CV.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(estimator=gradient_boosting_model, param_grid=param_grid, cv=10, n_jobs=8)
grid_search.fit(X_selected, y)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Persist the tuned parameters, the selected features and the modelling table.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(list(selected_features), columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

best_model_data = data[['NID', 'time1', 'IIR_3revi'] + list(selected_features)]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold probabilities of class 1. The default method='predict' returns
# hard labels, which made the 0.5 threshold a no-op and collapsed the AUC to
# a single operating point.
y_pred = cross_val_predict(best_model, X_selected, y, cv=10, method='predict_proba')[:, 1]
y_pred_binary = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
auc = roc_auc_score(y, y_pred)  # ranked by probability, not by label
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, selected_features, model):
    """Cross-validated metrics on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the 'time1' column.
        data: DataFrame holding 'time1', 'IIR_3revi' and the feature columns.
        selected_features: List of feature column names fed to the model.
        model: Classifier with ``predict_proba``; ``cross_val_predict`` refits
            clones of it on each of the 10 folds of the subset.

    Returns:
        Dict with 'Time Point' plus Accuracy, Precision, Recall, F1-Score,
        AUC, Specificity and Sensitivity.
    """
    subset = data[data['time1'] > time_point]
    X_subset = subset[selected_features]
    y_subset = subset['IIR_3revi']

    # Request out-of-fold probabilities of class 1. The default
    # method='predict' returns hard labels, which made the 0.5 threshold a
    # no-op and reduced the AUC to a single operating point.
    y_pred_subset = cross_val_predict(
        model, X_subset, y_subset, cv=10, method='predict_proba'
    )[:, 1]
    y_pred_binary_subset = (y_pred_subset > 0.5).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
    # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_binary_subset, labels=[0, 1]).ravel()
    recall = recall_score(y_subset, y_pred_binary_subset)
    negatives = tn + fp

    return {
        'Time Point': time_point,
        'Accuracy': accuracy_score(y_subset, y_pred_binary_subset),
        'Precision': precision_score(y_subset, y_pred_binary_subset),
        'Recall': recall,
        'F1-Score': f1_score(y_subset, y_pred_binary_subset),
        'AUC': roc_auc_score(y_subset, y_pred_subset),
        # Undefined without negatives; report NaN instead of dividing by zero.
        'Specificity': tn / negatives if negatives else float('nan'),
        'Sensitivity': recall
    }

# Thresholds on 'time1' at which the metrics are recomputed, one job each.
time_points = [5, 6, 7]
time_point_jobs = [
    delayed(calculate_metrics_at_time)(cutoff, data, list(selected_features), best_model)
    for cutoff in time_points
]
metrics_at_times = Parallel(n_jobs=8)(time_point_jobs)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, selected_features, model, model_name):
    """Score the already-fitted ``model`` on an external cohort and save the results.

    The model is applied as-is (no refitting), so the numbers describe how the
    fitted model transfers to ``external_data``. Cross-validating here would
    retrain fresh clones on the external cohort and report their performance
    instead of the fitted model's.

    Args:
        external_data: DataFrame with the ``selected_features`` columns plus
            'IIR_3revi' (target) and 'time1'.
        selected_features: List of feature column names the model was fitted on.
        model: Fitted classifier exposing ``predict_proba``.
        model_name: Prefix for the two CSV files written.

    Writes ``{model_name}_external_overall_metrics.csv`` and
    ``{model_name}_external_metrics_at_times.csv``; the latter has one row per
    entry of the module-level ``time_points`` list.
    """
    def _score(frame):
        # Metrics of the fitted model on one frame, from class-1 probabilities.
        y_true = frame['IIR_3revi']
        y_prob = model.predict_proba(frame[selected_features])[:, 1]
        y_hat = (y_prob > 0.5).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
        # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
        tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
        recall = recall_score(y_true, y_hat)
        negatives = tn + fp
        return {
            'Accuracy': accuracy_score(y_true, y_hat),
            'Precision': precision_score(y_true, y_hat),
            'Recall': recall,
            'F1-Score': f1_score(y_true, y_hat),
            'AUC': roc_auc_score(y_true, y_prob),
            'Specificity': tn / negatives if negatives else float('nan'),
            'Sensitivity': recall
        }

    overall_metrics_external = _score(external_data)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    # Same metrics restricted to rows with time1 above each threshold.
    metrics_at_times_external = [
        {'Time Point': cutoff, **_score(external_data[external_data['time1'] > cutoff])}
        for cutoff in time_points
    ]
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Evaluate on both external cohorts, then persist the tuned model.
external_runs = [
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
]
for cohort_frame, output_prefix in external_runs:
    evaluate_external_data(cohort_frame, list(selected_features), best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

AdaBoost

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every artifact this cell writes.
model_name = 'adaboost_classifier'

# Development cohort and the two external cohorts ('..' looks like a
# placeholder path in this copy -- TODO confirm the real locations).
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# Target first, then the predictor matrix: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(['NID', 'time1', 'IIR_3revi'], axis=1)

def select_features(X, y, model, n_features):
    """Fit ``model`` and return the names of its ``n_features`` top-ranked columns.

    Args:
        X: Feature DataFrame; the returned labels come from ``X.columns``.
        y: Target aligned with ``X``.
        model: Estimator exposing feature importances after ``fit``; it is
            fitted in place on the full ``X``/``y``.
        n_features: Number of columns to keep.

    Returns:
        pandas Index of the selected column labels, in ``X`` column order.
    """
    model.fit(X, y)
    # max_features alone is only an upper bound: SelectFromModel's default
    # importance threshold would still discard features and could return
    # fewer than n_features. threshold=-np.inf disables that cut-off so the
    # ranking by importance decides on its own.
    selector = SelectFromModel(model, threshold=-np.inf, max_features=n_features, prefit=True)
    return X.columns[selector.get_support(indices=True)]

# Rank features with a default stump-based AdaBoost and keep the selected columns.
base_estimator = DecisionTreeClassifier(max_depth=1)
adaboost_model = AdaBoostClassifier(estimator=base_estimator)
selected_features = select_features(X, y, adaboost_model, 8)
X_selected = X[selected_features]

# 'estimator__max_depth' tunes the depth of the wrapped decision tree.
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'estimator__max_depth': [1, 2, 3]
}

grid_search = GridSearchCV(estimator=adaboost_model, param_grid=param_grid, cv=10, n_jobs=8)
grid_search.fit(X_selected, y)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Persist the tuned parameters, the selected features and the modelling table.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(list(selected_features), columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

best_model_data = data[['NID', 'time1', 'IIR_3revi'] + list(selected_features)]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold probabilities of class 1. The default method='predict' returns
# hard labels, which made the 0.5 threshold a no-op and collapsed the AUC to
# a single operating point.
y_pred = cross_val_predict(best_model, X_selected, y, cv=10, method='predict_proba')[:, 1]
y_pred_binary = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
auc = roc_auc_score(y, y_pred)  # ranked by probability, not by label
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, selected_features, model):
    """Cross-validated metrics on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the 'time1' column.
        data: DataFrame holding 'time1', 'IIR_3revi' and the feature columns.
        selected_features: List of feature column names fed to the model.
        model: Classifier with ``predict_proba``; ``cross_val_predict`` refits
            clones of it on each of the 10 folds of the subset.

    Returns:
        Dict with 'Time Point' plus Accuracy, Precision, Recall, F1-Score,
        AUC, Specificity and Sensitivity.
    """
    subset = data[data['time1'] > time_point]
    X_subset = subset[selected_features]
    y_subset = subset['IIR_3revi']

    # Request out-of-fold probabilities of class 1. The default
    # method='predict' returns hard labels, which made the 0.5 threshold a
    # no-op and reduced the AUC to a single operating point.
    y_pred_subset = cross_val_predict(
        model, X_subset, y_subset, cv=10, method='predict_proba'
    )[:, 1]
    y_pred_binary_subset = (y_pred_subset > 0.5).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
    # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_binary_subset, labels=[0, 1]).ravel()
    recall = recall_score(y_subset, y_pred_binary_subset)
    negatives = tn + fp

    return {
        'Time Point': time_point,
        'Accuracy': accuracy_score(y_subset, y_pred_binary_subset),
        'Precision': precision_score(y_subset, y_pred_binary_subset),
        'Recall': recall,
        'F1-Score': f1_score(y_subset, y_pred_binary_subset),
        'AUC': roc_auc_score(y_subset, y_pred_subset),
        # Undefined without negatives; report NaN instead of dividing by zero.
        'Specificity': tn / negatives if negatives else float('nan'),
        'Sensitivity': recall
    }

# Thresholds on 'time1' at which the metrics are recomputed, one job each.
time_points = [5, 6, 7]
time_point_jobs = [
    delayed(calculate_metrics_at_time)(cutoff, data, list(selected_features), best_model)
    for cutoff in time_points
]
metrics_at_times = Parallel(n_jobs=8)(time_point_jobs)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, selected_features, model, model_name):
    """Score the already-fitted ``model`` on an external cohort and save the results.

    The model is applied as-is (no refitting), so the numbers describe how the
    fitted model transfers to ``external_data``. Cross-validating here would
    retrain fresh clones on the external cohort and report their performance
    instead of the fitted model's.

    Args:
        external_data: DataFrame with the ``selected_features`` columns plus
            'IIR_3revi' (target) and 'time1'.
        selected_features: List of feature column names the model was fitted on.
        model: Fitted classifier exposing ``predict_proba``.
        model_name: Prefix for the two CSV files written.

    Writes ``{model_name}_external_overall_metrics.csv`` and
    ``{model_name}_external_metrics_at_times.csv``; the latter has one row per
    entry of the module-level ``time_points`` list.
    """
    def _score(frame):
        # Metrics of the fitted model on one frame, from class-1 probabilities.
        y_true = frame['IIR_3revi']
        y_prob = model.predict_proba(frame[selected_features])[:, 1]
        y_hat = (y_prob > 0.5).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
        # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
        tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
        recall = recall_score(y_true, y_hat)
        negatives = tn + fp
        return {
            'Accuracy': accuracy_score(y_true, y_hat),
            'Precision': precision_score(y_true, y_hat),
            'Recall': recall,
            'F1-Score': f1_score(y_true, y_hat),
            'AUC': roc_auc_score(y_true, y_prob),
            'Specificity': tn / negatives if negatives else float('nan'),
            'Sensitivity': recall
        }

    overall_metrics_external = _score(external_data)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    # Same metrics restricted to rows with time1 above each threshold.
    metrics_at_times_external = [
        {'Time Point': cutoff, **_score(external_data[external_data['time1'] > cutoff])}
        for cutoff in time_points
    ]
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Evaluate on both external cohorts, then persist the tuned model.
external_runs = [
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
]
for cohort_frame, output_prefix in external_runs:
    evaluate_external_data(cohort_frame, list(selected_features), best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

### 2.2.5 XGBoost

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every artifact this cell writes.
model_name = 'xgboost_classifier'

# Development cohort and the two external cohorts ('..' looks like a
# placeholder path in this copy -- TODO confirm the real locations).
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# Target first, then the predictor matrix: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(['NID', 'time1', 'IIR_3revi'], axis=1)

def select_features(X, y, model, n_features):
    """Fit ``model`` and return the names of its ``n_features`` top-ranked columns.

    Args:
        X: Feature DataFrame; the returned labels come from ``X.columns``.
        y: Target aligned with ``X``.
        model: Estimator exposing feature importances after ``fit``; it is
            fitted in place on the full ``X``/``y``.
        n_features: Number of columns to keep.

    Returns:
        pandas Index of the selected column labels, in ``X`` column order.
    """
    model.fit(X, y)
    # max_features alone is only an upper bound: SelectFromModel's default
    # importance threshold would still discard features and could return
    # fewer than n_features. threshold=-np.inf disables that cut-off so the
    # ranking by importance decides on its own.
    selector = SelectFromModel(model, threshold=-np.inf, max_features=n_features, prefit=True)
    return X.columns[selector.get_support(indices=True)]

# Rank features with a default model and keep the selected columns.
xgboost_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
selected_features = select_features(X, y, xgboost_model, 8)
X_selected = X[selected_features]

# Hyper-parameter grid explored exhaustively with 10-fold CV.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(estimator=xgboost_model, param_grid=param_grid, cv=10, n_jobs=8)
grid_search.fit(X_selected, y)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Persist the tuned parameters, the selected features and the modelling table.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(list(selected_features), columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

best_model_data = data[['NID', 'time1', 'IIR_3revi'] + list(selected_features)]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold probabilities of class 1. The default method='predict' returns
# hard labels, which made the 0.5 threshold a no-op and collapsed the AUC to
# a single operating point.
y_pred = cross_val_predict(best_model, X_selected, y, cv=10, method='predict_proba')[:, 1]
y_pred_binary = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
auc = roc_auc_score(y, y_pred)  # ranked by probability, not by label
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, selected_features, model):
    """Cross-validated metrics on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the 'time1' column.
        data: DataFrame holding 'time1', 'IIR_3revi' and the feature columns.
        selected_features: List of feature column names fed to the model.
        model: Classifier with ``predict_proba``; ``cross_val_predict`` refits
            clones of it on each of the 10 folds of the subset.

    Returns:
        Dict with 'Time Point' plus Accuracy, Precision, Recall, F1-Score,
        AUC, Specificity and Sensitivity.
    """
    subset = data[data['time1'] > time_point]
    X_subset = subset[selected_features]
    y_subset = subset['IIR_3revi']

    # Request out-of-fold probabilities of class 1. The default
    # method='predict' returns hard labels, which made the 0.5 threshold a
    # no-op and reduced the AUC to a single operating point.
    y_pred_subset = cross_val_predict(
        model, X_subset, y_subset, cv=10, method='predict_proba'
    )[:, 1]
    y_pred_binary_subset = (y_pred_subset > 0.5).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
    # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_binary_subset, labels=[0, 1]).ravel()
    recall = recall_score(y_subset, y_pred_binary_subset)
    negatives = tn + fp

    return {
        'Time Point': time_point,
        'Accuracy': accuracy_score(y_subset, y_pred_binary_subset),
        'Precision': precision_score(y_subset, y_pred_binary_subset),
        'Recall': recall,
        'F1-Score': f1_score(y_subset, y_pred_binary_subset),
        'AUC': roc_auc_score(y_subset, y_pred_subset),
        # Undefined without negatives; report NaN instead of dividing by zero.
        'Specificity': tn / negatives if negatives else float('nan'),
        'Sensitivity': recall
    }

# Thresholds on 'time1' at which the metrics are recomputed, one job each.
time_points = [5, 6, 7]
time_point_jobs = [
    delayed(calculate_metrics_at_time)(cutoff, data, list(selected_features), best_model)
    for cutoff in time_points
]
metrics_at_times = Parallel(n_jobs=8)(time_point_jobs)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, selected_features, model, model_name):
    """Score the already-fitted ``model`` on an external cohort and save the results.

    The model is applied as-is (no refitting), so the numbers describe how the
    fitted model transfers to ``external_data``. Cross-validating here would
    retrain fresh clones on the external cohort and report their performance
    instead of the fitted model's.

    Args:
        external_data: DataFrame with the ``selected_features`` columns plus
            'IIR_3revi' (target) and 'time1'.
        selected_features: List of feature column names the model was fitted on.
        model: Fitted classifier exposing ``predict_proba``.
        model_name: Prefix for the two CSV files written.

    Writes ``{model_name}_external_overall_metrics.csv`` and
    ``{model_name}_external_metrics_at_times.csv``; the latter has one row per
    entry of the module-level ``time_points`` list.
    """
    def _score(frame):
        # Metrics of the fitted model on one frame, from class-1 probabilities.
        y_true = frame['IIR_3revi']
        y_prob = model.predict_proba(frame[selected_features])[:, 1]
        y_hat = (y_prob > 0.5).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
        # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
        tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
        recall = recall_score(y_true, y_hat)
        negatives = tn + fp
        return {
            'Accuracy': accuracy_score(y_true, y_hat),
            'Precision': precision_score(y_true, y_hat),
            'Recall': recall,
            'F1-Score': f1_score(y_true, y_hat),
            'AUC': roc_auc_score(y_true, y_prob),
            'Specificity': tn / negatives if negatives else float('nan'),
            'Sensitivity': recall
        }

    overall_metrics_external = _score(external_data)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    # Same metrics restricted to rows with time1 above each threshold.
    metrics_at_times_external = [
        {'Time Point': cutoff, **_score(external_data[external_data['time1'] > cutoff])}
        for cutoff in time_points
    ]
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Evaluate on both external cohorts, then persist the tuned model.
external_runs = [
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
]
for cohort_frame, output_prefix in external_runs:
    evaluate_external_data(cohort_frame, list(selected_features), best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

LightGBM

In [None]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every artifact this cell writes.
model_name = 'lightgbm_classifier'

# Development cohort and the two external cohorts ('..' looks like a
# placeholder path in this copy -- TODO confirm the real locations).
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# Target first, then the predictor matrix: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(['NID', 'time1', 'IIR_3revi'], axis=1)

def select_features(X, y, model, n_features):
    """Fit ``model`` and return the names of its ``n_features`` top-ranked columns.

    Args:
        X: Feature DataFrame; the returned labels come from ``X.columns``.
        y: Target aligned with ``X``.
        model: Estimator exposing feature importances after ``fit``; it is
            fitted in place on the full ``X``/``y``.
        n_features: Number of columns to keep.

    Returns:
        pandas Index of the selected column labels, in ``X`` column order.
    """
    model.fit(X, y)
    # max_features alone is only an upper bound: SelectFromModel's default
    # importance threshold would still discard features and could return
    # fewer than n_features. threshold=-np.inf disables that cut-off so the
    # ranking by importance decides on its own.
    selector = SelectFromModel(model, threshold=-np.inf, max_features=n_features, prefit=True)
    return X.columns[selector.get_support(indices=True)]

# Rank features with a default model and keep the selected columns.
lightgbm_model = lgb.LGBMClassifier()
selected_features = select_features(X, y, lightgbm_model, 8)
X_selected = X[selected_features]

# Hyper-parameter grid explored exhaustively with 10-fold CV.
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'num_leaves': [31, 50, 100],
    'max_depth': [-1, 10, 20],
    'min_child_samples': [20, 30, 50],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

grid_search = GridSearchCV(estimator=lightgbm_model, param_grid=param_grid, cv=10, n_jobs=8)
grid_search.fit(X_selected, y)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Persist the tuned parameters, the selected features and the modelling table.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(list(selected_features), columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

best_model_data = data[['NID', 'time1', 'IIR_3revi'] + list(selected_features)]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold probabilities of class 1. The default method='predict' returns
# hard labels, which made the 0.5 threshold a no-op and collapsed the AUC to
# a single operating point.
y_pred = cross_val_predict(best_model, X_selected, y, cv=10, method='predict_proba')[:, 1]
y_pred_binary = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
auc = roc_auc_score(y, y_pred)  # ranked by probability, not by label
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, selected_features, model):
    """Cross-validated metrics on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the 'time1' column.
        data: DataFrame holding 'time1', 'IIR_3revi' and the feature columns.
        selected_features: List of feature column names fed to the model.
        model: Classifier with ``predict_proba``; ``cross_val_predict`` refits
            clones of it on each of the 10 folds of the subset.

    Returns:
        Dict with 'Time Point' plus Accuracy, Precision, Recall, F1-Score,
        AUC, Specificity and Sensitivity.
    """
    subset = data[data['time1'] > time_point]
    X_subset = subset[selected_features]
    y_subset = subset['IIR_3revi']

    # Request out-of-fold probabilities of class 1. The default
    # method='predict' returns hard labels, which made the 0.5 threshold a
    # no-op and reduced the AUC to a single operating point.
    y_pred_subset = cross_val_predict(
        model, X_subset, y_subset, cv=10, method='predict_proba'
    )[:, 1]
    y_pred_binary_subset = (y_pred_subset > 0.5).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
    # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_binary_subset, labels=[0, 1]).ravel()
    recall = recall_score(y_subset, y_pred_binary_subset)
    negatives = tn + fp

    return {
        'Time Point': time_point,
        'Accuracy': accuracy_score(y_subset, y_pred_binary_subset),
        'Precision': precision_score(y_subset, y_pred_binary_subset),
        'Recall': recall,
        'F1-Score': f1_score(y_subset, y_pred_binary_subset),
        'AUC': roc_auc_score(y_subset, y_pred_subset),
        # Undefined without negatives; report NaN instead of dividing by zero.
        'Specificity': tn / negatives if negatives else float('nan'),
        'Sensitivity': recall
    }

# Thresholds on 'time1' at which the metrics are recomputed, one job each.
time_points = [5, 6, 7]
time_point_jobs = [
    delayed(calculate_metrics_at_time)(cutoff, data, list(selected_features), best_model)
    for cutoff in time_points
]
metrics_at_times = Parallel(n_jobs=8)(time_point_jobs)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, selected_features, model, model_name):
    """Score the already-fitted ``model`` on an external cohort and save the results.

    The model is applied as-is (no refitting), so the numbers describe how the
    fitted model transfers to ``external_data``. Cross-validating here would
    retrain fresh clones on the external cohort and report their performance
    instead of the fitted model's.

    Args:
        external_data: DataFrame with the ``selected_features`` columns plus
            'IIR_3revi' (target) and 'time1'.
        selected_features: List of feature column names the model was fitted on.
        model: Fitted classifier exposing ``predict_proba``.
        model_name: Prefix for the two CSV files written.

    Writes ``{model_name}_external_overall_metrics.csv`` and
    ``{model_name}_external_metrics_at_times.csv``; the latter has one row per
    entry of the module-level ``time_points`` list.
    """
    def _score(frame):
        # Metrics of the fitted model on one frame, from class-1 probabilities.
        y_true = frame['IIR_3revi']
        y_prob = model.predict_proba(frame[selected_features])[:, 1]
        y_hat = (y_prob > 0.5).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
        # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
        tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
        recall = recall_score(y_true, y_hat)
        negatives = tn + fp
        return {
            'Accuracy': accuracy_score(y_true, y_hat),
            'Precision': precision_score(y_true, y_hat),
            'Recall': recall,
            'F1-Score': f1_score(y_true, y_hat),
            'AUC': roc_auc_score(y_true, y_prob),
            'Specificity': tn / negatives if negatives else float('nan'),
            'Sensitivity': recall
        }

    overall_metrics_external = _score(external_data)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    # Same metrics restricted to rows with time1 above each threshold.
    metrics_at_times_external = [
        {'Time Point': cutoff, **_score(external_data[external_data['time1'] > cutoff])}
        for cutoff in time_points
    ]
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Evaluate on both external cohorts, then persist the tuned model.
external_runs = [
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
]
for cohort_frame, output_prefix in external_runs:
    evaluate_external_data(cohort_frame, list(selected_features), best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

CatBoost

In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every artifact this cell writes.
model_name = 'catboost_classifier'

# Development cohort and the two external cohorts ('..' looks like a
# placeholder path in this copy -- TODO confirm the real locations).
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# Target first, then the predictor matrix: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(['NID', 'time1', 'IIR_3revi'], axis=1)

def select_features(X, y, model, n_features):
    """Fit ``model`` and return the names of its ``n_features`` top-ranked columns.

    Args:
        X: Feature DataFrame; the returned labels come from ``X.columns``.
        y: Target aligned with ``X``.
        model: Estimator exposing feature importances after ``fit``; it is
            fitted in place on the full ``X``/``y``.
        n_features: Number of columns to keep.

    Returns:
        pandas Index of the selected column labels, in ``X`` column order.
    """
    model.fit(X, y)
    # max_features alone is only an upper bound: SelectFromModel's default
    # importance threshold would still discard features and could return
    # fewer than n_features. threshold=-np.inf disables that cut-off so the
    # ranking by importance decides on its own.
    selector = SelectFromModel(model, threshold=-np.inf, max_features=n_features, prefit=True)
    return X.columns[selector.get_support(indices=True)]

# Rank features with a default (silent) model and keep the selected columns.
catboost_model = CatBoostClassifier(verbose=0)
selected_features = select_features(X, y, catboost_model, 8)
X_selected = X[selected_features]

# Hyper-parameter grid explored exhaustively with 10-fold CV.
param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5],
    'border_count': [32, 64, 128]
}

grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=10, n_jobs=8)
grid_search.fit(X_selected, y)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Persist the tuned parameters, the selected features and the modelling table.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(list(selected_features), columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

best_model_data = data[['NID', 'time1', 'IIR_3revi'] + list(selected_features)]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold probabilities of class 1. The default method='predict' returns
# hard labels, which made the 0.5 threshold a no-op and collapsed the AUC to
# a single operating point.
y_pred = cross_val_predict(best_model, X_selected, y, cv=10, method='predict_proba')[:, 1]
y_pred_binary = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
auc = roc_auc_score(y, y_pred)  # ranked by probability, not by label
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, selected_features, model):
    """Cross-validated metrics on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the 'time1' column.
        data: DataFrame holding 'time1', 'IIR_3revi' and the feature columns.
        selected_features: List of feature column names fed to the model.
        model: Classifier with ``predict_proba``; ``cross_val_predict`` refits
            clones of it on each of the 10 folds of the subset.

    Returns:
        Dict with 'Time Point' plus Accuracy, Precision, Recall, F1-Score,
        AUC, Specificity and Sensitivity.
    """
    subset = data[data['time1'] > time_point]
    X_subset = subset[selected_features]
    y_subset = subset['IIR_3revi']

    # Request out-of-fold probabilities of class 1. The default
    # method='predict' returns hard labels, which made the 0.5 threshold a
    # no-op and reduced the AUC to a single operating point.
    y_pred_subset = cross_val_predict(
        model, X_subset, y_subset, cv=10, method='predict_proba'
    )[:, 1]
    y_pred_binary_subset = (y_pred_subset > 0.5).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
    # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_binary_subset, labels=[0, 1]).ravel()
    recall = recall_score(y_subset, y_pred_binary_subset)
    negatives = tn + fp

    return {
        'Time Point': time_point,
        'Accuracy': accuracy_score(y_subset, y_pred_binary_subset),
        'Precision': precision_score(y_subset, y_pred_binary_subset),
        'Recall': recall,
        'F1-Score': f1_score(y_subset, y_pred_binary_subset),
        'AUC': roc_auc_score(y_subset, y_pred_subset),
        # Undefined without negatives; report NaN instead of dividing by zero.
        'Specificity': tn / negatives if negatives else float('nan'),
        'Sensitivity': recall
    }

# Thresholds on 'time1' at which the metrics are recomputed, one job each.
time_points = [5, 6, 7]
time_point_jobs = [
    delayed(calculate_metrics_at_time)(cutoff, data, list(selected_features), best_model)
    for cutoff in time_points
]
metrics_at_times = Parallel(n_jobs=8)(time_point_jobs)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, selected_features, model, model_name):
    """Score the already-fitted ``model`` on an external cohort and save the results.

    The model is applied as-is (no refitting), so the numbers describe how the
    fitted model transfers to ``external_data``. Cross-validating here would
    retrain fresh clones on the external cohort and report their performance
    instead of the fitted model's.

    Args:
        external_data: DataFrame with the ``selected_features`` columns plus
            'IIR_3revi' (target) and 'time1'.
        selected_features: List of feature column names the model was fitted on.
        model: Fitted classifier exposing ``predict_proba``.
        model_name: Prefix for the two CSV files written.

    Writes ``{model_name}_external_overall_metrics.csv`` and
    ``{model_name}_external_metrics_at_times.csv``; the latter has one row per
    entry of the module-level ``time_points`` list.
    """
    def _score(frame):
        # Metrics of the fitted model on one frame, from class-1 probabilities.
        y_true = frame['IIR_3revi']
        y_prob = model.predict_proba(frame[selected_features])[:, 1]
        y_hat = (y_prob > 0.5).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
        # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
        tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
        recall = recall_score(y_true, y_hat)
        negatives = tn + fp
        return {
            'Accuracy': accuracy_score(y_true, y_hat),
            'Precision': precision_score(y_true, y_hat),
            'Recall': recall,
            'F1-Score': f1_score(y_true, y_hat),
            'AUC': roc_auc_score(y_true, y_prob),
            'Specificity': tn / negatives if negatives else float('nan'),
            'Sensitivity': recall
        }

    overall_metrics_external = _score(external_data)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    # Same metrics restricted to rows with time1 above each threshold.
    metrics_at_times_external = [
        {'Time Point': cutoff, **_score(external_data[external_data['time1'] > cutoff])}
        for cutoff in time_points
    ]
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Evaluate on both external cohorts, then persist the tuned model.
external_runs = [
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
]
for cohort_frame, output_prefix in external_runs:
    evaluate_external_data(cohort_frame, list(selected_features), best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

Lasso Regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix shared by every artifact this cell writes.
model_name = 'lasso_regression'

# Development cohort and the two external cohorts ('..' looks like a
# placeholder path in this copy -- TODO confirm the real locations).
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# Target first, then the predictor matrix: every column except NID, time1
# and the target itself.
y = data['IIR_3revi']
X = data.drop(['NID', 'time1', 'IIR_3revi'], axis=1)

def select_features(X, y, model, n_features):
    """Fit ``model`` on ``X``/``y`` and return the column labels it selects.

    ``n_features`` is passed as ``max_features``, i.e. an upper bound:
    SelectFromModel's default threshold still applies, so fewer columns may
    come back. ``model`` is fitted in place.
    """
    model.fit(X, y)
    support_mask = SelectFromModel(model, max_features=n_features, prefit=True).get_support()
    return X.columns[support_mask]

# LassoCV is used only to pick features; the classifier itself is an
# L1-penalised logistic regression tuned below.
lasso = LassoCV(cv=10)
selected_features = select_features(X, y, lasso, 8)
X_selected = X[selected_features]

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1'],
    'solver': ['liblinear']
}

# max_iter=10000 matches the logistic-regression cell of this notebook and
# avoids stopping at the default 100 iterations before convergence.
logistic = LogisticRegression(max_iter=10000)
grid_search = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=10, n_jobs=8)
grid_search.fit(X_selected, y)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Persist the tuned parameters, the selected features and the modelling table.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(list(selected_features), columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

best_model_data = data[['NID', 'time1', 'IIR_3revi'] + list(selected_features)]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold probabilities of class 1. The default method='predict' returns
# hard labels, which made the 0.5 threshold a no-op and collapsed the AUC to
# a single operating point.
y_pred = cross_val_predict(best_model, X_selected, y, cv=10, method='predict_proba')[:, 1]
y_pred_binary = (y_pred > 0.5).astype(int)

accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
auc = roc_auc_score(y, y_pred)  # ranked by probability, not by label
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, selected_features, model):
    """Cross-validated metrics on the rows of ``data`` with ``time1 > time_point``.

    Args:
        time_point: Exclusive lower bound applied to the 'time1' column.
        data: DataFrame holding 'time1', 'IIR_3revi' and the feature columns.
        selected_features: List of feature column names fed to the model.
        model: Classifier with ``predict_proba``; ``cross_val_predict`` refits
            clones of it on each of the 10 folds of the subset.

    Returns:
        Dict with 'Time Point' plus Accuracy, Precision, Recall, F1-Score,
        AUC, Specificity and Sensitivity.
    """
    subset = data[data['time1'] > time_point]
    X_subset = subset[selected_features]
    y_subset = subset['IIR_3revi']

    # Request out-of-fold probabilities of class 1. The default
    # method='predict' returns hard labels, which made the 0.5 threshold a
    # no-op and reduced the AUC to a single operating point.
    y_pred_subset = cross_val_predict(
        model, X_subset, y_subset, cv=10, method='predict_proba'
    )[:, 1]
    y_pred_binary_subset = (y_pred_subset > 0.5).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
    # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_binary_subset, labels=[0, 1]).ravel()
    recall = recall_score(y_subset, y_pred_binary_subset)
    negatives = tn + fp

    return {
        'Time Point': time_point,
        'Accuracy': accuracy_score(y_subset, y_pred_binary_subset),
        'Precision': precision_score(y_subset, y_pred_binary_subset),
        'Recall': recall,
        'F1-Score': f1_score(y_subset, y_pred_binary_subset),
        'AUC': roc_auc_score(y_subset, y_pred_subset),
        # Undefined without negatives; report NaN instead of dividing by zero.
        'Specificity': tn / negatives if negatives else float('nan'),
        'Sensitivity': recall
    }

# Thresholds on 'time1' at which the metrics are recomputed, one job each.
time_points = [5, 6, 7]
time_point_jobs = [
    delayed(calculate_metrics_at_time)(cutoff, data, list(selected_features), best_model)
    for cutoff in time_points
]
metrics_at_times = Parallel(n_jobs=8)(time_point_jobs)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, selected_features, model, model_name):
    """Score the already-fitted ``model`` on an external cohort and save the results.

    The model is applied as-is (no refitting), so the numbers describe how the
    fitted model transfers to ``external_data``. Cross-validating here would
    retrain fresh clones on the external cohort and report their performance
    instead of the fitted model's.

    Args:
        external_data: DataFrame with the ``selected_features`` columns plus
            'IIR_3revi' (target) and 'time1'.
        selected_features: List of feature column names the model was fitted on.
        model: Fitted classifier exposing ``predict_proba``.
        model_name: Prefix for the two CSV files written.

    Writes ``{model_name}_external_overall_metrics.csv`` and
    ``{model_name}_external_metrics_at_times.csv``; the latter has one row per
    entry of the module-level ``time_points`` list.
    """
    def _score(frame):
        # Metrics of the fitted model on one frame, from class-1 probabilities.
        y_true = frame['IIR_3revi']
        y_prob = model.predict_proba(frame[selected_features])[:, 1]
        y_hat = (y_prob > 0.5).astype(int)
        # labels=[0, 1] keeps the matrix 2x2 even when a class is missing
        # (assumes IIR_3revi is coded 0/1, as the thresholding already does).
        tn, fp, fn, tp = confusion_matrix(y_true, y_hat, labels=[0, 1]).ravel()
        recall = recall_score(y_true, y_hat)
        negatives = tn + fp
        return {
            'Accuracy': accuracy_score(y_true, y_hat),
            'Precision': precision_score(y_true, y_hat),
            'Recall': recall,
            'F1-Score': f1_score(y_true, y_hat),
            'AUC': roc_auc_score(y_true, y_prob),
            'Specificity': tn / negatives if negatives else float('nan'),
            'Sensitivity': recall
        }

    overall_metrics_external = _score(external_data)
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    # Same metrics restricted to rows with time1 above each threshold.
    metrics_at_times_external = [
        {'Time Point': cutoff, **_score(external_data[external_data['time1'] > cutoff])}
        for cutoff in time_points
    ]
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Evaluate on both external cohorts, then persist the tuned model.
external_runs = [
    (external_data1, f'{model_name}_ditan'),
    (external_data2, f'{model_name}_youan'),
]
for cohort_frame, output_prefix in external_runs:
    evaluate_external_data(cohort_frame, list(selected_features), best_model, output_prefix)

joblib.dump(best_model, f'{model_name}_best_model.joblib')

Elastic Net

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import ElasticNetCV, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import GridSearchCV, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from joblib import Parallel, delayed
import joblib

# Prefix for the output files written below.
model_name = 'elastic_net'

# NOTE(review): '..' looks like a placeholder — point each read_csv call at
# the actual CSV file before running.
data = pd.read_csv('..')
external_data1 = pd.read_csv('..')
external_data2 = pd.read_csv('..')

# Every column except NID, time1 and the target IIR_3revi is a candidate predictor.
X = data.drop(columns=['NID', 'time1', 'IIR_3revi'])
y = data['IIR_3revi']

def select_features(X, y, model, n_features):
    """Fit `model` on (X, y) and return the columns SelectFromModel keeps.

    `model` is fitted in place and handed to SelectFromModel as a pre-fitted
    estimator. `n_features` is passed as `max_features`, which is an upper
    bound: no explicit threshold is supplied, so SelectFromModel's default
    importance threshold still applies and fewer columns may be returned.

    Returns:
        The selected column labels, taken from ``X.columns``.
    """
    model.fit(X, y)
    picker = SelectFromModel(estimator=model, prefit=True, max_features=n_features)
    keep_mask = picker.get_support()  # boolean mask over X's columns
    return X.columns[keep_mask]

# Feature selection: fit a cross-validated elastic net and keep at most 8
# columns via SelectFromModel (see select_features).
elastic_net = ElasticNetCV(cv=10)
selected_features = select_features(X, y, elastic_net, 8)
X_selected = X[selected_features]

# Grid for an elastic-net-penalised logistic regression; 'saga' is the
# solver that supports penalty='elasticnet'.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.1, 0.5, 0.7, 0.9]
}

# saga often stops before convergence at the default max_iter=100, which
# would make the grid compare unconverged fits. Raise the cap to the value
# the logistic-regression section of this file already uses.
logistic = LogisticRegression(max_iter=10000)
# NOTE(review): no `scoring` is given, so candidates are ranked by the
# estimator's default score (accuracy), whereas the logistic-regression
# section ranks by 'roc_auc' — confirm which criterion is intended.
grid_search = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=10, n_jobs=8)
grid_search.fit(X_selected, y)
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Record the winning hyper-parameters and the selected feature names.
pd.DataFrame([best_params]).to_csv(f'{model_name}_best_params.csv', index=False)
pd.DataFrame(list(selected_features), columns=['Best Features']).to_csv(f'{model_name}_best_features.csv', index=False)

# Export the modelling table: NID, time1, target and the selected features.
best_model_data = data[['NID', 'time1', 'IIR_3revi'] + list(selected_features)]
best_model_data.to_csv(f'{model_name}_best_model_data.csv', index=False)

# Out-of-fold positive-class probabilities from 10-fold cross-validation.
# method='predict_proba' is required here: the default returns hard class
# labels, which would make the 0.5 threshold a no-op and reduce the AUC
# below to a single operating point.
y_pred = cross_val_predict(best_model, X_selected, y, cv=10, method='predict_proba')[:, 1]
y_pred_binary = (y_pred > 0.5).astype(int)

# Threshold-based metrics use the binarised predictions; AUC uses the scores.
accuracy = accuracy_score(y, y_pred_binary)
precision = precision_score(y, y_pred_binary)
recall = recall_score(y, y_pred_binary)
f1 = f1_score(y, y_pred_binary)
auc = roc_auc_score(y, y_pred)
tn, fp, fn, tp = confusion_matrix(y, y_pred_binary).ravel()
specificity = tn / (tn + fp)
sensitivity = recall  # sensitivity is the same quantity as recall

overall_metrics = {
    'Accuracy': accuracy,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'AUC': auc,
    'Specificity': specificity,
    'Sensitivity': sensitivity
}
pd.DataFrame([overall_metrics]).to_csv(f'{model_name}_overall_metrics.csv', index=False)

def calculate_metrics_at_time(time_point, data, selected_features, model):
    """Cross-validated classification metrics for rows with ``time1 > time_point``.

    Args:
        time_point: Threshold applied to the ``time1`` column (strictly greater).
        data: DataFrame containing ``time1``, ``IIR_3revi`` and the feature columns.
        selected_features: Column names used as model inputs.
        model: scikit-learn estimator. ``cross_val_predict`` refits a clone
            of it on each of the 10 folds of the subset.

    Returns:
        dict with the keys 'Time Point', 'Accuracy', 'Precision', 'Recall',
        'F1-Score', 'AUC', 'Specificity' and 'Sensitivity'.
    """
    subset = data[data['time1'] > time_point]
    X_subset = subset[selected_features]
    y_subset = subset['IIR_3revi']

    # AUC needs a continuous score. cross_val_predict returns hard class
    # labels by default, so request probabilities when the estimator offers
    # them (column 1 = the larger class label, i.e. 1 for a 0/1 target);
    # otherwise keep the raw predict() output as before.
    if hasattr(model, 'predict_proba'):
        y_pred_subset = cross_val_predict(
            model, X_subset, y_subset, cv=10, method='predict_proba'
        )[:, 1]
    else:
        y_pred_subset = cross_val_predict(model, X_subset, y_subset, cv=10)
    y_pred_binary_subset = (y_pred_subset > 0.5).astype(int)

    accuracy = accuracy_score(y_subset, y_pred_binary_subset)
    precision = precision_score(y_subset, y_pred_binary_subset)
    recall = recall_score(y_subset, y_pred_binary_subset)
    f1 = f1_score(y_subset, y_pred_binary_subset)
    auc = roc_auc_score(y_subset, y_pred_subset)
    tn, fp, fn, tp = confusion_matrix(y_subset, y_pred_binary_subset).ravel()
    specificity = tn / (tn + fp)
    sensitivity = recall  # sensitivity is the same quantity as recall

    return {
        'Time Point': time_point,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }

# Thresholds on time1; calculate_metrics_at_time keeps rows with time1 > threshold.
time_points = [5, 6, 7]
# One job per threshold (up to 8 workers); each job returns one metrics dict,
# and the dicts become the rows of the CSV written below.
metrics_at_times = Parallel(n_jobs=8)(delayed(calculate_metrics_at_time)(tp, data, list(selected_features), best_model) for tp in time_points)
pd.DataFrame(metrics_at_times).to_csv(f'{model_name}_metrics_at_times.csv', index=False)

def evaluate_external_data(external_data, selected_features, model, model_name):
    """Cross-validate `model` on an external cohort and write the metrics to CSV.

    Out-of-fold predictions come from 10-fold ``cross_val_predict`` on
    ``external_data[selected_features]`` against the ``IIR_3revi`` column.
    Overall metrics are written to
    ``{model_name}_external_overall_metrics.csv``; one row of metrics per
    entry of the module-level ``time_points`` is written to
    ``{model_name}_external_metrics_at_times.csv``.

    Args:
        external_data: DataFrame containing the selected feature columns,
            ``IIR_3revi`` and ``time1`` (read by ``calculate_metrics_at_time``).
        selected_features: Column names used as model inputs.
        model: scikit-learn estimator. ``cross_val_predict`` refits a clone
            of it on every fold.
        model_name: Prefix for the two output file names.

    Returns:
        None. Results are written to disk.
    """
    # NOTE(review): because cross_val_predict refits on the external rows,
    # these figures describe a model re-trained on the external cohort, not
    # the already-fitted `model` — confirm this is the intended validation.
    X_external = external_data[selected_features]
    y_external = external_data['IIR_3revi']

    # AUC needs a continuous score. cross_val_predict returns hard class
    # labels by default, so request probabilities when the estimator offers
    # them (column 1 = the larger class label, i.e. 1 for a 0/1 target);
    # otherwise keep the raw predict() output as before.
    if hasattr(model, 'predict_proba'):
        y_pred_external = cross_val_predict(
            model, X_external, y_external, cv=10, method='predict_proba'
        )[:, 1]
    else:
        y_pred_external = cross_val_predict(model, X_external, y_external, cv=10)
    y_pred_binary_external = (y_pred_external > 0.5).astype(int)

    accuracy = accuracy_score(y_external, y_pred_binary_external)
    precision = precision_score(y_external, y_pred_binary_external)
    recall = recall_score(y_external, y_pred_binary_external)
    f1 = f1_score(y_external, y_pred_binary_external)
    auc = roc_auc_score(y_external, y_pred_external)
    tn, fp, fn, tp = confusion_matrix(y_external, y_pred_binary_external).ravel()
    specificity = tn / (tn + fp)
    sensitivity = recall  # sensitivity is the same quantity as recall

    overall_metrics_external = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'AUC': auc,
        'Specificity': specificity,
        'Sensitivity': sensitivity
    }
    pd.DataFrame([overall_metrics_external]).to_csv(f'{model_name}_external_overall_metrics.csv', index=False)

    # The loop variable is deliberately not called `tp`, to avoid confusion
    # with the true-positive count unpacked above.
    metrics_at_times_external = Parallel(n_jobs=8)(
        delayed(calculate_metrics_at_time)(time_point, external_data, list(selected_features), model)
        for time_point in time_points
    )
    pd.DataFrame(metrics_at_times_external).to_csv(f'{model_name}_external_metrics_at_times.csv', index=False)

# Evaluate the grid-searched best_model on each external dataset; the suffix
# appended to model_name becomes part of the output CSV file names.
evaluate_external_data(external_data1, list(selected_features), best_model, f'{model_name}_ditan')
evaluate_external_data(external_data2, list(selected_features), best_model, f'{model_name}_youan')

# Save best_model to disk so it can be reloaded with joblib.load.
joblib.dump(best_model, f'{model_name}_best_model.joblib')

In [None]:
# To use a fixed, hand-picked feature set instead of the one produced by
# select_features, override selected_features as shown here. The same
# override can be used in every algorithm's section.
selected_features = ['B9', 'B8', 'B12_group', 'B18_group', 'B10_group', 'AGE_10', 'B16_group', 'B2', 
                     'B25', 'B19_group']
# Rebuild the design matrix and target from the overridden feature list.
X = data[selected_features]
y = data['IIR_3revi']