In [None]:
from petastorm import make_reader
import numpy as np
import matplotlib.pyplot as plt
from ecgdetectors import Detectors
from sklearn.metrics import confusion_matrix,auc
import sys
from sklearn import metrics
import pandas as pd
import seaborn as sn
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold,cross_val_score
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
sys.path.append(os.path.join(os.getcwd(), ".."))
from Metrics import Wrapper_main_function
# Location of the dataset that is handed (as a file URL) to petastorm's make_reader.
path_formatted_glasgow = "/workspaces/maitrise/data/20221006_physio_quality/set-a/dataParquet"
# NOTE(review): the path above already starts with "/", so this expands to
# "file:////workspaces/..." (four slashes) — confirm that is what the reader expects.
path_petastorm = f"file:///{path_formatted_glasgow}"
# CSV that is read into `label_ref` to obtain the reference quality labels.
path_csv_ref_label = "/workspaces/maitrise/data/20221006_physio_quality/set-a/REFERENCE.csv"

In [None]:
# Read the reference table and keep its raw values: column 1 holds the quality
# label, column 0 is cast to int below (presumably the record id — verify).
label_ref = pd.read_csv(path_csv_ref_label).to_numpy()
Y = label_ref[:, 1].copy()

# Rows whose label is anything other than "unlabeled" are kept.
is_labeled = Y != "unlabeled"
Y_true = Y[is_labeled]
X_true = label_ref[:, 0].copy()[is_labeled].astype(int)

In [None]:
len_patients = 998  ## number of rows pre-allocated for labelled recordings; "unlabeled" ones are skipped below
len_lead = 12
# One (lead, sample) matrix per labelled recording; rows are filled in read order.
Data = np.empty([len_patients,len_lead,5000])
patient_ids = []  # noun_id of every stored recording, in read order
ind = 0

with make_reader(path_petastorm) as reader:
    for sample in reader:
        # Recordings without a reference label are not used at all.
        if sample.signal_quality == "unlabeled".encode():
            continue
        ECG_lead = sample.signal_names
        fs = sample.sampling_frequency
        # Collect ids in a list: np.append would re-allocate the whole array on
        # every iteration and silently turn the ids into floats.
        patient_ids.append(int(sample.noun_id))
        # sample.signal is indexed [time, lead]; store it as [lead, time].
        Data[ind,:,:] = sample.signal[:, :len(ECG_lead)].T
        ind += 1

index_patient = np.array(patient_ids)
# Keep only the rows that were filled, ordered by ascending noun_id.
# NOTE(review): the labels taken from REFERENCE.csv (Y_true) stay in file order —
# confirm the CSV is sorted by id so that recordings and labels line up.
Data = Data[np.argsort(index_patient),:,:]

In [None]:
# Names of the quality indices requested from the metrics wrapper.
name_method = ["Corr_interlead","Corr_intralead","wPMF","SNRECG","HR","Kurtosis","Flatline","TSD"]
n_records = len(index_patient)
Matrix_features = np.empty([n_records, len_lead, len(name_method)])

# One wrapper call per recording; its result fills that recording's
# (lead, index) slice of the feature tensor.
for j in range(n_records):
    Patient = Data[j]
    Matrix_features[j] = Wrapper_main_function.main(Patient, fs, name_method)



In [None]:
# Collapse the lead axis: every index is replaced by its mean over the leads
# of the recording, giving one feature row per recording.
n_records = len(index_patient)
n_features = Matrix_features.shape[2]
X = np.empty([n_records, len(name_method)])
for rec in range(n_records):
    per_lead = Matrix_features[rec]
    X[rec, :] = [np.mean(per_lead[:, feat]) for feat in range(n_features)]

# Encode the string labels as integers: "acceptable" -> 1, "unacceptable" -> 0.
Ycop = Y_true.copy()
Ycop[Y_true == "unacceptable"] = 0
Ycop[Y_true == "acceptable"] = 1
Ycop = Ycop.astype(int)

X = pd.DataFrame(X, columns=name_method)
y = pd.DataFrame({"y": Ycop})

In [None]:
from imblearn.over_sampling import SMOTE

# Hold out 30 % of the recordings first, then oversample the training part only.
smote = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
columns = X_train.columns

os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# Class balance after oversampling (y == 0: unacceptable, y == 1: acceptable).
n_total = len(os_data_X)
n_unacceptable = len(os_data_y[os_data_y['y'] == 0])
n_acceptable = len(os_data_y[os_data_y['y'] == 1])
print("length of oversampled data is ", n_total)
print("Number of unacceptable recordings in oversampled data", n_unacceptable)
print("Number of acceptable recordings in oversampled data", n_acceptable)
print("Proportion of unacceptable recordings in oversampled data is ", n_unacceptable / n_total)
print("Proportion of acceptable recordings in oversampled data is ", n_acceptable / n_total)

In [None]:
# Logit fit on the oversampled training data with every quality index as regressor.
# NOTE(review): no constant column is added to the regressors and statsmodels does
# not add one itself — confirm that a model without intercept is intended.
logit_model = sm.Logit(endog=os_data_y, exog=os_data_X)
result = logit_model.fit()
print(result.summary2())

In [None]:
## Same model on the full, non-resampled data (class imbalance not corrected):
logit_model = sm.Logit(endog=y, exog=X)
result = logit_model.fit()
print(result.summary2())

In [None]:
# Refit on the oversampled data using only this subset of indices.
cols = ["Corr_interlead","Corr_intralead","SNRECG","TSD"]
X_new = os_data_X.loc[:, cols]

logit_model = sm.Logit(endog=os_data_y, exog=X_new)
result = logit_model.fit()
print(result.summary2())

In [None]:
## Same column subset on the full, non-resampled data (class imbalance not corrected):
X_new_1 = X.loc[:, cols]
logit_model = sm.Logit(endog=y, exog=X_new_1)
result = logit_model.fit()
print(result.summary2())

In [None]:
# Refit on the oversampled data with the column subset narrowed to three indices.
cols = ["Corr_interlead","Corr_intralead","SNRECG"]
X_new = os_data_X.loc[:, cols]

logit_model = sm.Logit(endog=os_data_y, exog=X_new)
result = logit_model.fit()
print(result.summary2())

In [None]:
## Same three columns on the full, non-resampled data (class imbalance not corrected):
X_new_2 = X.loc[:, cols]
logit_model = sm.Logit(endog=y, exog=X_new_2)
result = logit_model.fit()
print(result.summary2())

In [None]:
# Train on the SMOTE-balanced training rows and evaluate on the hold-out split
# that was set aside BEFORE oversampling. The oversampled frame must not be
# split again: its rows include synthetic points generated from training
# samples, so a test set drawn from it would leak training information and be
# artificially balanced.
X_train = X_new
y_train = os_data_y.values.ravel()
X_test = X_test[cols]      # same column subset as the training features
y_test = np.ravel(y_test)  # 1-D label vector; safe to re-run on an ndarray
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

In [None]:
# Hard class predictions for the test split, plus the classifier's accuracy on it.
y_pred = logreg.predict(X_test)
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
from sklearn.metrics import confusion_matrix
# Store the result under its own name so the imported function is not
# overwritten by the matrix it returns.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

In [None]:
from sklearn.metrics import classification_report
# Text report comparing the test labels with the predicted classes.
report = classification_report(y_test, y_pred)
print(report)

In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
# Use the predicted probability of class 1 for BOTH the curve and its area.
# Feeding hard 0/1 predictions to roc_auc_score yields the area of a
# single-threshold curve, which does not match the curve plotted below.
y_score = logreg.predict_proba(X_test)[:, 1]
logit_roc_auc = roc_auc_score(y_test, y_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1],'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()

In [None]:
from sklearn.metrics import auc
from sklearn.metrics import precision_recall_curve
precision, recall, thresholds = precision_recall_curve(y_test, logreg.predict_proba(X_test)[:,1])
# Area under the precision-recall curve, kept in its own variable so the ROC
# area computed elsewhere is not overwritten by a different metric.
logit_pr_auc = auc(recall, precision)
# Reference level for a precision-recall plot: the share of positive labels in
# the test set (a line at precision 0 carries no information).
no_skill = float(np.mean(np.ravel(y_test)))

plt.figure()
plt.plot(recall, precision, label='Logistic Regression (area = %0.2f)' % logit_pr_auc)
plt.plot([0, 1], [no_skill, no_skill],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall curve')
plt.legend(loc="lower right")
#plt.savefig('Log_PR')
plt.show()

In [None]:
# Repeated stratified 15-fold cross-validation (20 repeats) of a default
# logistic regression on all features of the full data set, scored with F1.
cv = RepeatedStratifiedKFold(n_splits=15, n_repeats=20, random_state=0)
model = LogisticRegression()
scores = cross_val_score(
    model,
    X,
    y.values.ravel(),
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)
f1_mean, f1_std = np.mean(scores), np.std(scores)
print('F1-score: %.3f (%.3f)' % (f1_mean, f1_std))

In [None]:
def evaluate_model(X, y, repeats):
    """Cross-validate a default LogisticRegression with repeated stratified 10-fold CV.

    Parameters
    ----------
    X : feature table passed unchanged to ``cross_val_score``.
    y : target vector passed unchanged to ``cross_val_score``.
    repeats : number of repetitions of the 10-fold split (``n_repeats``).

    Returns
    -------
    The array of accuracy scores returned by ``cross_val_score``.
    """
    splitter = RepeatedStratifiedKFold(n_splits=10, n_repeats=repeats, random_state=1)
    return cross_val_score(
        LogisticRegression(), X, y, scoring='accuracy', cv=splitter, n_jobs=-1
    )

# Sweep the number of CV repeats and report how the accuracy estimate behaves.
repeats = range(1,20)
results = list()
y_flat = y.values.ravel()  # loop-invariant 1-D label vector
for r in repeats:
    # evaluate using a given number of repeats
    scores = evaluate_model(X, y_flat, r)
    # The value printed as "se" is the standard error of the mean
    # (sample standard deviation / sqrt(number of scores)), not the raw spread.
    std_err = np.std(scores, ddof=1) / np.sqrt(len(scores))
    print('>%d mean=%.4f se=%.3f' % (r, np.mean(scores), std_err))
    # keep the raw scores for the box plot
    results.append(scores)
# one box per number of repeats
plt.boxplot(results, labels=[str(r) for r in repeats], showmeans=True)
plt.show()