In [None]:
import os
import csv
import array
import base64
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import copy
import pickle5
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix, log_loss, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SequentialFeatureSelector
from imblearn.over_sampling import SMOTE
# Shared SMOTE over-sampler used by the modelling cells when SMOTE_ON is True.
# Seeded so the synthetic minority samples (and therefore every reported metric)
# are reproducible across kernel restarts, like the seeded train/test splits.
sm = SMOTE(random_state=42)

## Load dataset

In [None]:
# Read the CSV export (CP949-encoded).
csv_path = '/your_path/KSC_DB renew_20210612_brief.csv'  # You should edit the file path
raw_table = pd.read_csv(csv_path, encoding='CP949')

# Discard rows in which every column is NaN, then renumber the index.
non_empty_rows = raw_table.dropna(how='all').reset_index(drop=True)

# Keep only the rows flagged as study patients (Study_yes == 1).
is_study_patient = non_empty_rows['Study_yes'] == 1
data = non_empty_rows.loc[is_study_patient].reset_index(drop=True)

# Build a composite identifier of the form "<DCC_ID>_<patient_ID>" (both cast to int first).
dcc_part = data['DCC_ID'].astype(int).astype(str)
patient_part = data['patient_ID'].astype(int).astype(str)
data['ID'] = dcc_part + '_' + patient_part

## To use SMOTE to handle the class-imbalance problem, set this to True

In [None]:
# Switch read by the modelling cells: when True, the training split (only) is
# over-sampled with `sm.fit_resample` before the model is fitted.
SMOTE_ON = False

## Exp setting

### - Exp 1 setting ( 0 vs (1/2/3/4) )
### - Exp 2 setting ( 0 vs (1/2/3/4/9) )

In [None]:
# If you want to use exp 2 setting, type 'exp2'
# 'exp1' keeps outcome codes 0-4; 'exp2' additionally keeps outcome code 9.
target_exp = 'exp1' 

In [None]:
# Outcome codes retained for each experiment setting.
outcomes_by_exp = {
    'exp1': [0, 1, 2, 3, 4],
    'exp2': [0, 1, 2, 3, 4, 9],
}
if target_exp not in outcomes_by_exp:
    # Fail fast: merely printing would let the cell continue and either crash with a
    # NameError or silently reuse a stale `data_target` from an earlier run.
    raise ValueError('you should type exp1 or exp2 for target_exp')

data_target = data.loc[data['outcome'].isin(outcomes_by_exp[target_exp])].reset_index(drop=True)
data_target['outcome'] = data_target['outcome'].astype(int)

# Work on an independent copy so `data_target` keeps the original columns only.
data_target_effective = data_target.copy()

# Binary label: outcome 0 stays 0, every other retained outcome becomes 1.
data_effective = (data_target['outcome'] != 0).astype(int).tolist()
data_target_effective['label'] = data_effective

# Keep the identifier, the label and the clinical features, then drop any row
# that has a NaN in one of them.
clinical_columns = ['ID', 'label', 'sex', 'age', 'AF_duration', 'latest_AAD', 'New-CVASc', 'LVEF', 'LA', 'BMI']
data_target_effective_clinical = (
    data_target_effective[clinical_columns]
    .dropna()
    .reset_index(drop=True)
)

### Number of patients by each label

In [None]:
# Report how many rows of the clinical frame carry each binary label.
label_column = data_target_effective_clinical['label']
for label_value in (0, 1):
    rows_with_label = data_target_effective_clinical.loc[label_column == label_value]
    print('label ' + str(label_value) + ' : ' + str(len(rows_with_label)))

## Logistic regression using all features without feature pre-processing

In [None]:
# Baseline: unpenalised logistic regression on all clinical features as stored in the frame.
# Columns from index 2 onward are used as features; the target is the 'label' column.
X_train, X_test, y_train, y_test = train_test_split(data_target_effective_clinical.values[:,2:], 
                                                    data_target_effective_clinical['label'].values, 
                                                    test_size=0.2,
                                                    random_state=42)
# Over-sample the training split only; the test split is left untouched.
if SMOTE_ON is True:
    X_train, y_train = sm.fit_resample(X_train, y_train)

# NOTE(review): newer scikit-learn releases spell this `penalty=None`; the string form
# is kept for the version this notebook was written against - confirm before upgrading.
model = LogisticRegression(max_iter=1000, penalty='none')
model.fit(X_train, y_train)

# Probability-based metrics (log-loss, AUROC, AUPRC) need predicted probabilities;
# feeding them thresholded 0/1 labels collapses the ranking and inflates the loss.
proba_tr = model.predict_proba(X_train)
proba_te = model.predict_proba(X_test)
y_score = proba_te[:, 1]  # probability of the positive class (label 1)

print('train_loss')
print(log_loss(y_train, proba_tr, labels=model.classes_))
print('test_loss')
print(log_loss(y_test, proba_te, labels=model.classes_))

# Hard 0/1 predictions are only used for the confusion-matrix based metrics.
y_pred_tr = model.predict(X_train)
y_pred = model.predict(X_test).astype(float)

# labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted/observed.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
acc = (tp + tn) / (tp + fp + fn + tn) * 100
sen = tp / (tp + fn) * 100
spe = tn / (tn + fp) * 100
# With imbalanced data the model may predict no positives at all; report 0 instead of NaN.
pr = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn)
f1 = (2*pr*recall) / (pr + recall) if (pr + recall) > 0 else 0.0

print('auroc : ' + str(roc_auc_score(y_test, y_score)))
print('sen : ' + str(sen))
print('spe : ' + str(spe))
print('f1 : ' + str(f1))
print('auprc : ' + str(average_precision_score(y_test, y_score)))

## Convert a continuous feature to categorical feature

In [None]:
# Replace selected continuous columns with 0.0/1.0 indicator columns.
# The columns are overwritten in place, so running this cell a second time without
# rebuilding the frame would threshold the already-binarised values again.
#
# AF_duration and LVEF are left as they are. Disabled cut-offs previously drafted here:
# AF_duration >= 12, LVEF >= 40.0 and hs-CRP >= 0.1 (each with NaN mapped to 99.0).

### Age: 1.0 when age >= 58
age_is_high = data_target_effective_clinical['age'].ge(58)
data_target_effective_clinical['age'] = age_is_high.astype(float)

### latest AAD: 1.0 only for code 5
aad_is_five = data_target_effective_clinical['latest_AAD'].eq(5)
data_target_effective_clinical['latest_AAD'] = aad_is_five.astype(float)

### New-CVASc: NaN -> 99.0, score >= 2.0 -> 1.0, otherwise 0.0
cvasc_score = data_target_effective_clinical['New-CVASc']
data_target_effective_clinical['New-CVASc'] = np.select(
    [cvasc_score.isna().to_numpy(), (cvasc_score >= 2.0).to_numpy()],
    [99.0, 1.0],
    default=0.0,
)

### LA: 1.0 when LA >= 50.0
la_is_large = data_target_effective_clinical['LA'].ge(50.0)
data_target_effective_clinical['LA'] = la_is_large.astype(float)

### BMI: each value is converted with float() first, then 1.0 when BMI >= 28.0
bmi_as_float = data_target_effective_clinical['BMI'].map(float)
data_target_effective_clinical['BMI'] = bmi_as_float.ge(28.0).astype(float)

## Logistic regression using all features

In [None]:
# Unpenalised logistic regression on all clinical features of the current frame.
# Columns from index 2 onward are used as features; the target is the 'label' column.
X_train, X_test, y_train, y_test = train_test_split(data_target_effective_clinical.values[:,2:], 
                                                    data_target_effective_clinical['label'].values, 
                                                    test_size=0.2,
                                                    random_state=42)
# Over-sample the training split only; the test split is left untouched.
if SMOTE_ON is True:
    X_train, y_train = sm.fit_resample(X_train, y_train)

# NOTE(review): newer scikit-learn releases spell this `penalty=None`; the string form
# is kept for the version this notebook was written against - confirm before upgrading.
model = LogisticRegression(max_iter=1000, penalty='none')
model.fit(X_train, y_train)

# Probability-based metrics (log-loss, AUROC, AUPRC) need predicted probabilities;
# feeding them thresholded 0/1 labels collapses the ranking and inflates the loss.
proba_tr = model.predict_proba(X_train)
proba_te = model.predict_proba(X_test)
y_score = proba_te[:, 1]  # probability of the positive class (label 1)

print('train_loss')
print(log_loss(y_train, proba_tr, labels=model.classes_))
print('test_loss')
print(log_loss(y_test, proba_te, labels=model.classes_))

# Hard 0/1 predictions are only used for the confusion-matrix based metrics.
y_pred_tr = model.predict(X_train)
y_pred = model.predict(X_test).astype(float)

# labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted/observed.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
acc = (tp + tn) / (tp + fp + fn + tn) * 100
sen = tp / (tp + fn) * 100
spe = tn / (tn + fp) * 100
# With imbalanced data the model may predict no positives at all; report 0 instead of NaN.
pr = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn)
f1 = (2*pr*recall) / (pr + recall) if (pr + recall) > 0 else 0.0

print('auroc : ' + str(roc_auc_score(y_test, y_score)))
print('sen : ' + str(sen))
print('spe : ' + str(spe))
print('f1 : ' + str(f1))
print('auprc : ' + str(average_precision_score(y_test, y_score)))

## Feature selection

In [None]:
# Forward sequential feature selection: pick 3 features, scoring candidates with a
# LogisticRegression(C=1000) estimator fitted on the current training split.
selection_estimator = LogisticRegression(C=1000)
sfs = SequentialFeatureSelector(selection_estimator, n_features_to_select=3, direction='forward')
sfs = sfs.fit(X_train, y_train)

# Map the boolean support mask back to column names (feature columns start at index 2).
candidate_features = data_target_effective_clinical.columns[2:]
candidate_features[sfs.get_support()]

## Logistic regression using selected features

### The selected features used in the following cell are 'age', 'LVEF', 'LA', 'AF_duration', 'latest_AAD'

In [None]:
# Unpenalised logistic regression restricted to a hand-picked feature subset.
# The frame used here is `data_target_effective_clinical`; the previously referenced
# `tt_target_effective_clinical` is never defined in this notebook and raised a
# NameError on a fresh kernel.
selected_features = ['age', 'LVEF', 'LA', 'AF_duration', 'latest_AAD']
X_train, X_test, y_train, y_test = train_test_split(data_target_effective_clinical[selected_features].values, 
                                                    data_target_effective_clinical['label'].values, 
                                                    test_size=0.2,
                                                    random_state=42)
# Over-sample the training split only; the test split is left untouched.
if SMOTE_ON is True:
    X_train, y_train = sm.fit_resample(X_train, y_train)

# NOTE(review): newer scikit-learn releases spell this `penalty=None`; the string form
# is kept for the version this notebook was written against - confirm before upgrading.
model = LogisticRegression(max_iter=1000, penalty='none')
model.fit(X_train, y_train)

# Probability-based metrics (log-loss, AUROC, AUPRC) need predicted probabilities;
# feeding them thresholded 0/1 labels collapses the ranking and inflates the loss.
proba_tr = model.predict_proba(X_train)
proba_te = model.predict_proba(X_test)
y_score = proba_te[:, 1]  # probability of the positive class (label 1)

print('train_loss')
print(log_loss(y_train, proba_tr, labels=model.classes_))
print('test_loss')
print(log_loss(y_test, proba_te, labels=model.classes_))

# Hard 0/1 predictions are only used for the confusion-matrix based metrics.
y_pred_tr = model.predict(X_train)
y_pred = model.predict(X_test).astype(float)

# labels=[0, 1] guarantees a 2x2 matrix even if one class is never predicted/observed.
tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()
acc = (tp + tn) / (tp + fp + fn + tn) * 100
sen = tp / (tp + fn) * 100
spe = tn / (tn + fp) * 100
# With imbalanced data the model may predict no positives at all; report 0 instead of NaN.
pr = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn)
f1 = (2*pr*recall) / (pr + recall) if (pr + recall) > 0 else 0.0

print('auroc : ' + str(roc_auc_score(y_test, y_score)))
print('sen : ' + str(sen))
print('spe : ' + str(spe))
print('f1 : ' + str(f1))
print('auprc : ' + str(average_precision_score(y_test, y_score)))