In [None]:
import os
import sys
import warnings

import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import statsmodels.api as sm
from ecgdetectors import Detectors
from petastorm import make_reader
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, classification_report, confusion_matrix
from sklearn.model_selection import (RepeatedStratifiedKFold, cross_val_score,
                                     train_test_split)

# Silence FutureWarning messages so they do not clutter the cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Make the parent of the current working directory importable; the two
# project-local imports below are resolved through this extra entry.
sys.path.append(os.path.join(os.getcwd(), ".."))
import shared_utils.utils_data as utils_data
from Metrics.Wrapper_main_function import compute_metrics, save_metrics_to_xarray

# Absolute location of the parquet data set and the URL built from it.
# NOTE(review): path_formatted_glasgow already starts with "/", so the f-string
# expands to "file:////workspaces/..." (four slashes) — confirm the reader accepts it.
path_formatted_glasgow = "/workspaces/maitrise/data/20221006_physio_quality/set-a/dataParquet"
path_petastorm = f"file:///{path_formatted_glasgow}"

In [None]:
# Folder holding the cached netCDF files, and the quality indices to compute.
save_path = "/workspaces/maitrise/results"
name_method = [
    "Corr_interlead",
    "Corr_intralead",
    "wPMF",
    "SNRECG",
    "HR",
    "Kurtosis",
    "Flatline",
    "TSD",
]

if not os.path.exists(save_path):
    os.makedirs(save_path)

if "quality_metrics.nc" in os.listdir(save_path):
    # The metrics were computed by a previous run: load them from disk.
    ds_metrics = xr.load_dataset(os.path.join(save_path, "quality_metrics.nc"))
else:
    print("Computing metrics")
    if "ecg_data.nc" in os.listdir(save_path):
        # Reuse the ECG data already converted to xarray.
        ds_data = xr.load_dataset(os.path.join(save_path, "ecg_data.nc"))
    else:
        ds_data = utils_data.format_data_to_xarray(path_petastorm, save_path)

    ds_metrics = save_metrics_to_xarray(ds_data, name_method, save_path, verbose=True)

In [None]:
### Save the summary tables into a folder
def save_table(path_data, summary, name_folder):
    """Write the first two tables of a model summary to CSV files.

    Parameters
    ----------
    path_data : str
        Directory under which the output folder is created.
    summary : object
        Summary object exposing a ``tables`` sequence whose first two items
        provide ``to_csv`` (e.g. the result of ``result.summary2()``).
    name_folder : str
        Name of the sub-folder of ``path_data`` that receives the CSV files.

    The folder is created when missing. ``summary.tables[0]`` is written to
    ``Results_logit.csv`` and ``summary.tables[1]`` to
    ``Coefficient_results.csv``; existing files are overwritten.
    """
    table_names = ("Results_logit", "Coefficient_results")

    # Build the path with os.path.join so no separator is lost between the
    # base directory and the folder name.
    path_to_folder = os.path.join(path_data, name_folder)
    os.makedirs(path_to_folder, exist_ok=True)

    for index, table_name in enumerate(table_names):
        csv_path = os.path.join(path_to_folder, "{}.csv".format(table_name))
        summary.tables[index].to_csv(csv_path)

In [None]:
# Keep only the recordings that carry a quality label.
ds_filtered = ds_metrics.where(ds_metrics.data_quality != "unlabeled").dropna(dim="id")

np_metrics = ds_filtered.quality_metrics.values
metrics_names = ds_filtered.metric_name.values.tolist()

# Encode the labels as integers: acceptable -> 1, unacceptable -> 0.
# Work on a copy so the recoding does not overwrite the label array held by
# ds_filtered, which would leave the dataset with "1"/"0" instead of the names.
np_label = ds_filtered.data_quality.values.copy()
np_label[np_label == "acceptable"] = 1
np_label[np_label == "unacceptable"] = 0
# Any other remaining label makes this conversion fail loudly.
np_label = np_label.astype(int)

In [None]:
# Average the metrics over axis 1 to get one row per recording
# (presumably axis 1 is the ECG lead — TODO confirm against the dataset layout).
X_mean = np_metrics.mean(axis=1)
df_X = pd.DataFrame(X_mean, columns=metrics_names)
df_y = pd.DataFrame(np_label, columns=["y"])

# `X` / `y` name the full feature table and its labels. They are DataFrames so
# that column selection (`X[cols]`), `X_train.columns` and `y.values` all work.
X = df_X
y = df_y

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=0)
columns = X_train.columns

# Resample the training split only; the test split is left untouched.
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# Check the class counts after oversampling (y == 0: unacceptable, y == 1: acceptable).
n_total = len(os_data_X)
n_unacceptable = len(os_data_y[os_data_y['y'] == 0])
n_acceptable = len(os_data_y[os_data_y['y'] == 1])
print("length of oversampled data is ", n_total)
print("Number of unacceptable recordings in oversampled data", n_unacceptable)
print("Number of acceptable recordings in oversampled data", n_acceptable)
print("Proportion of unacceptable recordings in oversampled data is ", n_unacceptable / n_total)
print("Proportion of acceptable recordings in oversampled data is ", n_acceptable / n_total)

In [None]:
### Logistic model fitted on the SMOTE-resampled training set, all metrics as predictors.
# NOTE(review): sm.Logit does not add an intercept on its own and no constant
# column is added here — confirm that a model without intercept is intended.
logit_model = sm.Logit(os_data_y, os_data_X)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)
save_table(save_path, logit_summary, "all_features_SMOTE")

In [None]:
## Same model without correcting the class imbalance:
## fitted on the full, non-resampled feature table df_X and labels df_y.
logit_model = sm.Logit(df_y, df_X)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)
save_table(save_path, logit_summary, "all_features")

In [None]:
### Same idea, restricted to the statistically significant features (SMOTE data).
cols = [
    "Corr_interlead",
    "SNRECG",
    "TSD",
    "HR",
    "wPMF",
]
X_new = os_data_X[cols]

logit_model = sm.Logit(os_data_y, X_new)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)
save_table(save_path, logit_summary, "Stat_significant_feature_SMOTE")

In [None]:
## Without taking the class imbalance into account:
## the current `cols` subset, taken from the non-resampled table df_X.
X_new_1 = df_X[cols]
logit_model = sm.Logit(df_y, X_new_1)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)
save_table(save_path, logit_summary, "Stat_significant_feature")

In [None]:
## Same feature set with TSD removed (SMOTE data).
cols = [
    "Corr_interlead",
    "SNRECG",
    "HR",
    "wPMF",
]
X_new_new = os_data_X[cols]

logit_model = sm.Logit(os_data_y, X_new_new)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)
save_table(save_path, logit_summary, "Stat_significant_feature_noTSD_SMOTE")

In [None]:
## Without taking the class imbalance into account:
## the current `cols` subset, taken from the non-resampled table df_X.
X_new_2 = df_X[cols]
logit_model = sm.Logit(df_y, X_new_2)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)
save_table(save_path, logit_summary, "Stat_significant_feature_noTSD")

In [None]:
## Same feature set with HR removed (SMOTE data).
cols = [
    "Corr_interlead",
    "SNRECG",
    "TSD",
    "wPMF",
]
X_new_new = os_data_X[cols]

logit_model = sm.Logit(os_data_y, X_new_new)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)
save_table(save_path, logit_summary, "Stat_significant_feature_noHR_SMOTE")

In [None]:
## Without taking the class imbalance into account:
## the current `cols` subset, taken from the non-resampled table df_X.
X_new_2 = df_X[cols]
logit_model = sm.Logit(df_y, X_new_2)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)
save_table(save_path, logit_summary, "Stat_significant_feature_noHR")

In [None]:
## Same without TSD and HR: only Corr_interlead and SNRECG are kept (SMOTE data).
cols = [
    "Corr_interlead",
    "SNRECG",
]
X_new_new = os_data_X[cols]

logit_model = sm.Logit(os_data_y, X_new_new)
result = logit_model.fit()
summary2 = result.summary2()
print(summary2)
save_table(save_path, summary2, "Stat_significant_feature_noTSDHR_SMOTE")

In [None]:
## Without taking the class imbalance into account:
## the current `cols` subset, taken from the non-resampled table df_X.
X_new_2 = df_X[cols]
logit_model = sm.Logit(df_y, X_new_2)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)
save_table(save_path, logit_summary, "Stat_significant_feature_noTSDHR")

In [None]:
# Replace TSD by the intra-lead correlation (SMOTE data); the summary is only printed, not saved.
cols = [
    "Corr_interlead",
    "SNRECG",
    "Corr_intralead",
]
X_new_new = os_data_X[cols]

logit_model = sm.Logit(os_data_y, X_new_new)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)

In [None]:
## Without taking the class imbalance into account:
## the current `cols` subset, taken from the non-resampled table df_X.
X_new_2 = df_X[cols]
logit_model = sm.Logit(df_y, X_new_2)
result = logit_model.fit()
logit_summary = result.summary2()
print(logit_summary)

In [None]:
#### Train a scikit-learn logistic regression on the SMOTE-resampled training split and
#### evaluate it on the test split of the initial data set (which stays imbalanced).

X_train, X_test, y_train, y_test_balanced = train_test_split(
    df_X, df_y.values.ravel(), test_size=0.3, random_state=0
)

columns = X_train.columns

os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])
cols = ["Corr_interlead", "SNRECG", "TSD"]
os_data_X = os_data_X[cols]

logreg_balanced = LogisticRegression()
# Fit on plain arrays with a 1-D target: the test features below are a NumPy
# array too, and a column-vector target would trigger a conversion warning.
logreg_balanced.fit(os_data_X.to_numpy(), os_data_y['y'].to_numpy())

x_test_balanced = pd.DataFrame(data=X_test, columns=columns)
x_test_balanced = x_test_balanced[cols].to_numpy()
y_pred_balanced = logreg_balanced.predict(x_test_balanced)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg_balanced.score(x_test_balanced, y_test_balanced)))

cm = confusion_matrix(y_test_balanced, y_pred_balanced)
print(cm)

print(classification_report(y_test_balanced, y_pred_balanced))

In [None]:
## Now we train on the unbalanced data set (no SMOTE):
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y.values.ravel(), test_size=0.3, random_state=0
)

columns = X_train.columns

cols = ["Corr_interlead", "SNRECG", "TSD", "HR", "Corr_intralead"]
# Fit on the training split only. Fitting on the full table would include the
# rows of the test split and make the scores below optimistic.
X_train_imbalanced = X_train[cols]

logreg = LogisticRegression()
logreg.fit(X_train_imbalanced.to_numpy(), y_train)

x_test = pd.DataFrame(data=X_test, columns=columns)
x_test = x_test[cols].to_numpy()
y_pred = logreg.predict(x_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(x_test, y_test)))
cm = confusion_matrix(y_test, y_pred)
print(cm)

print(classification_report(y_test, y_pred))

In [None]:
## ROC curve of the model trained on the SMOTE-resampled data
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Use the predicted probability of class 1 for both the curve and its area.
# Computing the area from hard 0/1 predictions gives the area of a one-point
# curve, which does not match the curve drawn below.
proba_balanced = logreg_balanced.predict_proba(x_test_balanced)[:, 1]
logit_roc_auc = roc_auc_score(y_test_balanced, proba_balanced)
fpr, tpr, thresholds = roc_curve(y_test_balanced, proba_balanced)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance level
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic using SMOTED dataset')
plt.legend(loc="lower right")
plt.show()

In [None]:
## ROC curve of the model trained without SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Use the predicted probability of class 1 for both the curve and its area.
# Computing the area from hard 0/1 predictions gives the area of a one-point
# curve, which does not match the curve drawn below.
proba_imbalanced = logreg.predict_proba(x_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, proba_imbalanced)
fpr, tpr, thresholds = roc_curve(y_test, proba_imbalanced)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--')  # chance level
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic without using SMOTE')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Precision-recall curve of the model trained on the SMOTE-resampled data
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve

# logreg_balanced must be scored on x_test_balanced, the test matrix built with
# the same feature subset the model was fitted on.
proba_balanced = logreg_balanced.predict_proba(x_test_balanced)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test_balanced, proba_balanced)
pr_auc = auc(recall, precision)

# A classifier with no skill has a precision equal to the share of positives.
no_skill = np.mean(y_test_balanced)

plt.figure()
plt.plot(recall, precision, label='Logistic Regression (area = %0.2f)' % pr_auc)
plt.plot([0, 1], [no_skill, no_skill], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve with SMOTED dataset')
plt.legend(loc="lower right")
plt.show()

In [None]:
## Precision-recall curve of the model trained without SMOTE
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve

proba_imbalanced = logreg.predict_proba(x_test)[:, 1]
precision, recall, thresholds = precision_recall_curve(y_test, proba_imbalanced)
pr_auc = auc(recall, precision)

# A classifier with no skill has a precision equal to the share of positives.
no_skill = np.mean(y_test)

plt.figure()
plt.plot(recall, precision, label='Logistic Regression (area = %0.2f)' % pr_auc)
plt.plot([0, 1], [no_skill, no_skill], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve without using SMOTE')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Repeated stratified k-fold estimate of the F1-score on the full, non-resampled data.
cv = RepeatedStratifiedKFold(n_splits=15, n_repeats=20, random_state=0)
model = LogisticRegression()
scores = cross_val_score(model, df_X, df_y.values.ravel(), scoring='f1', cv=cv, n_jobs=-1)
print('F1-score: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

In [None]:
def evaluate_model(X, y, repeats):
    """Cross-validate a default LogisticRegression and return its F1 scores.

    Parameters
    ----------
    X : features passed to ``cross_val_score``.
    y : target labels passed to ``cross_val_score``.
    repeats : int
        Number of repetitions of the stratified 10-fold split.

    Returns
    -------
    The array of F1 scores returned by ``cross_val_score``, one per fold and
    repetition.
    """
    # Fixed random_state so the folds are the same from one call to the next.
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=repeats, random_state=1)
    return cross_val_score(LogisticRegression(), X, y, scoring='f1', cv=splitter, n_jobs=-1)

# Study how the cross-validated F1 estimate behaves as the number of repeats grows.
repeats = range(1, 20)
results = list()
labels_1d = df_y.values.ravel()
for r in repeats:
    # evaluate using a given number of repeats
    scores = evaluate_model(df_X, labels_1d, r)
    # summarize: the message reports a standard error, so divide the sample
    # standard deviation by sqrt(number of scores) instead of printing the raw std
    standard_error = np.std(scores, ddof=1) / np.sqrt(len(scores))
    print('>%d mean=%.4f se=%.3f' % (r, np.mean(scores), standard_error))
    # store
    results.append(scores)
# plot the results: one box per number of repeats
plt.boxplot(results, labels=[str(r) for r in repeats], showmeans=True)
plt.show()