In [None]:
import os
import sys
import warnings
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn
import statsmodels.api as sm
from ecgdetectors import Detectors
from petastorm import make_reader
from sklearn.metrics import auc,roc_curve,precision_recall_curve,roc_auc_score,RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import (RepeatedStratifiedKFold, cross_val_score,
                                     train_test_split)

# Hide FutureWarning messages only; every other warning category is still shown.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Add the parent of the current working directory to the import path. This must run
# before the three imports below (presumably that is where `shared_utils` and
# `Metrics` live — confirm against the repository layout).
sys.path.append(os.path.join(os.getcwd(), ".."))
import shared_utils.utils_data as utils_data
from shared_utils import Logistic_reg_model
from Metrics.Wrapper_main_function import compute_metrics, save_metrics_to_xarray

# Folder holding the Parquet-formatted dataset, and the matching file URL that is
# later handed to utils_data.format_data_to_xarray.
path_formatted_glasgow = "/workspaces/maitrise/data/20221006_physio_quality/set-a/dataParquet"
path_petastorm = "file:///" + path_formatted_glasgow

In [None]:
# Directory used both as a cache (ecg_data.nc, quality_metrics.nc) and for result tables.
save_path = "/workspaces/maitrise/results"
# Metric names handed to save_metrics_to_xarray.
name_method = ["Corr_interlead","Corr_intralead","wPMF","SNRECG","HR","Kurtosis","Flatline","TSD"]

# exist_ok=True replaces the "check, then create" pair. The former stray
# `os.path.join(save_path, "")` call discarded its result and has been removed.
os.makedirs(save_path, exist_ok=True)

metrics_file = os.path.join(save_path, "quality_metrics.nc")
ecg_file = os.path.join(save_path, "ecg_data.nc")

if os.path.exists(metrics_file):
    # Metrics were computed by an earlier run: reuse them.
    ds_metrics = xr.load_dataset(metrics_file)
else:
    print("Computing metrics")
    # Reuse the formatted ECG dataset when it is cached, otherwise build it from the Parquet data.
    if os.path.exists(ecg_file):
        ds_data = xr.load_dataset(ecg_file)
    else:
        ds_data = utils_data.format_data_to_xarray(path_petastorm, save_path)

    ds_metrics = save_metrics_to_xarray(ds_data, name_method, save_path, verbose = True)

In [None]:
def save_table(path_data,summary,name_folder):
    """Save a model summary as a PNG image plus its first two tables as CSV files.

    Parameters
    ----------
    path_data : str
        Directory under which the output folder is created.
    summary : object
        Summary to export. ``str(summary)`` is rendered into the image, and
        ``summary.tables[0]`` / ``summary.tables[1]`` must each provide ``to_csv``
        (in this notebook it is always ``result.summary2()``).
    name_folder : str
        Sub-folder of ``path_data`` that receives the files. Nested names are
        allowed; missing parent directories are created.

    Side effects
    ------------
    Writes ``output.png``, ``Results_logit.csv`` and ``Coefficient_results.csv``
    into ``path_data/name_folder``. The figure is left open, so an inline
    notebook backend still displays it.

    Raises
    ------
    IndexError
        If ``summary.tables`` holds fewer than two tables.
    """
    table_names = ("Results_logit", "Coefficient_results")
    path_to_folder = os.path.join(path_data, name_folder)
    # makedirs + exist_ok: handles nested folder names and repeated calls, unlike os.mkdir.
    os.makedirs(path_to_folder, exist_ok=True)

    # Kept for backward compatibility: this also sets the default size of figures
    # created later in the session.
    plt.rc('figure', figsize=(12, 7))
    # Draw on a dedicated figure so the text never lands on top of whatever
    # figure happened to be current when the function was called.
    fig, ax = plt.subplots(figsize=(12, 7))
    # Monospace keeps the columns of the text table aligned.
    ax.text(0.01, 0.05, str(summary), {'fontsize': 17}, fontproperties = 'monospace')
    ax.axis('off')
    fig.tight_layout()
    fig.savefig(os.path.join(path_to_folder, 'output.png'))

    for index, table_name in enumerate(table_names):
        summary.tables[index].to_csv(os.path.join(path_to_folder, table_name + ".csv"))


In [None]:
# Keep only the labelled entries: mask out "unlabeled" ones, then drop the ids that became NaN.
ds_filtered = ds_metrics.where(ds_metrics.data_quality != "unlabeled").dropna(dim = "id")

np_metrics = ds_filtered.quality_metrics.values
metrics_names = ds_filtered.metric_name.values.tolist()

# Encode the labels as integers: "acceptable" -> 1, "unacceptable" -> 0.
# A fresh array is built instead of assigning into `.values`, which would
# overwrite the label array stored inside ds_filtered.
raw_labels = np.asarray(ds_filtered.data_quality.values)
is_acceptable = raw_labels == "acceptable"
is_unacceptable = raw_labels == "unacceptable"
if not np.all(is_acceptable | is_unacceptable):
    # Fail with a clear message instead of an obscure int() conversion error.
    raise ValueError("data_quality contains labels other than 'acceptable'/'unacceptable'")
np_label = is_acceptable.astype(int)

In [None]:
# Collapse axis 1 of the metrics array by averaging
# (presumably the lead axis — TODO confirm against save_metrics_to_xarray).
X = np.mean(np_metrics, axis=1)
# One column per metric for the features, and a single "y" column for the labels.
df_X = pd.DataFrame(data=X, columns=metrics_names)
df_y = pd.DataFrame(data=np_label, columns=["y"])

In [None]:
from imblearn.over_sampling import SMOTE

# 70/30 split of the original data; only the training part is oversampled.
smote = SMOTE(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.3, random_state=0)
columns = X_train.columns

os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# Class balance after oversampling. Labels are 1 = acceptable, 0 = unacceptable;
# the earlier messages spoke of "subscription", which belongs to another dataset.
n_total = len(os_data_X)
n_unacceptable = len(os_data_y[os_data_y['y'] == 0])
n_acceptable = len(os_data_y[os_data_y['y'] == 1])
print("length of oversampled data is ", n_total)
print("Number of unacceptable samples in oversampled data", n_unacceptable)
print("Number of acceptable samples in oversampled data", n_acceptable)
print("Proportion of unacceptable samples in oversampled data is ", n_unacceptable / n_total)
print("Proportion of acceptable samples in oversampled data is ", n_acceptable / n_total)

In [None]:
## Run the project's backward model selection on both datasets and print what it returns.
# First on the SMOTE-resampled training data ...
# (presumably the result is the list of retained feature names: it is later passed as
#  `cols` to ROC_CV_curve — confirm in Logistic_reg_model)
SMOTE_feature = Logistic_reg_model.Backward_model_selection(os_data_X,os_data_y)
print(SMOTE_feature)

# ... then on the original, non-resampled data.
Normal_feature = Logistic_reg_model.Backward_model_selection(df_X,df_y)
print(Normal_feature)

In [None]:
### Logit model on the SMOTE-resampled training set, using every metric column.
# NOTE(review): the design matrix is passed as-is; statsmodels does not add an
# intercept on its own — confirm that a model without constant is intended.
logit_model = sm.Logit(endog=os_data_y, exog=os_data_X)
result = logit_model.fit()
print(result.summary2())
save_table(path_data=save_path, summary=result.summary2(), name_folder="all_features_SMOTE")

In [None]:
# Backward elimination on the SMOTE-resampled data, step 1: refit without "Kurtosis".
cols = list(os_data_X.columns)
cols.remove("Kurtosis")
logit_model = sm.Logit(endog=os_data_y, exog=os_data_X[cols])
result = logit_model.fit()
print(result.summary2())

# NOTE(review): unlike the other elimination steps, this one is not saved — the
# save_table call for "Kurtosis_rm_features_SMOTE" is commented out. Confirm this is intended.

In [None]:
# Backward elimination on the SMOTE-resampled data, step 2: also drop "Flatline".
# `cols` is rebuilt from the full column index instead of mutating the list left by
# the previous cell, so re-running this cell no longer fails in list.remove.
# Index.drop raises KeyError if a listed metric is not a column.
cols = os_data_X.columns.drop(["Kurtosis", "Flatline"]).tolist()
logit_model = sm.Logit(os_data_y, os_data_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(save_path, result.summary2(), "Flatline_rm_features_SMOTE")

In [None]:
# Backward elimination on the SMOTE-resampled data, step 3: also drop "Corr_intralead".
# Rebuilding `cols` from the full column index keeps the cell re-runnable (no in-place
# list.remove); Index.drop raises KeyError if a listed metric is not a column.
cols = os_data_X.columns.drop(["Kurtosis", "Flatline", "Corr_intralead"]).tolist()
logit_model = sm.Logit(os_data_y, os_data_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(save_path, result.summary2(), "intralead_rm_features_SMOTE")

In [None]:
# Backward elimination on the SMOTE-resampled data, step 4: also drop "HR".
# Rebuilding `cols` from the full column index keeps the cell re-runnable (no in-place
# list.remove); Index.drop raises KeyError if a listed metric is not a column.
cols = os_data_X.columns.drop(["Kurtosis", "Flatline", "Corr_intralead", "HR"]).tolist()
logit_model = sm.Logit(os_data_y, os_data_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(save_path, result.summary2(), "Backward_features_SMOTE")

In [None]:
## Check the impact of TSD: refit on the SMOTE-resampled data after also dropping "TSD".
# Rebuilding `cols` from the full column index keeps the cell re-runnable (no in-place
# list.remove); Index.drop raises KeyError if a listed metric is not a column.
cols = os_data_X.columns.drop(["Kurtosis", "Flatline", "Corr_intralead", "HR", "TSD"]).tolist()
logit_model = sm.Logit(os_data_y, os_data_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(save_path, result.summary2(), "TSD_rm_features_SMOTE")

In [None]:
## Same Logit model, fitted on the original data (class imbalance not corrected).
# NOTE(review): the design matrix is passed as-is; statsmodels does not add an
# intercept on its own — confirm that a model without constant is intended.
logit_model = sm.Logit(endog=df_y, exog=df_X)
result = logit_model.fit()
print(result.summary2())
save_table(path_data=save_path, summary=result.summary2(), name_folder="all_features")

In [None]:
# Backward elimination on the original data, step 1: refit without "Kurtosis".
cols = list(df_X.columns)
cols.remove("Kurtosis")
logit_model = sm.Logit(endog=df_y, exog=df_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(path_data=save_path, summary=result.summary2(), name_folder="Kurtosis_rm_features")

In [None]:
# Backward elimination on the original data, step 2: also drop "Flatline".
# `cols` is rebuilt from the full column index instead of mutating the list left by
# the previous cell, so re-running this cell no longer fails in list.remove.
# Index.drop raises KeyError if a listed metric is not a column.
cols = df_X.columns.drop(["Kurtosis", "Flatline"]).tolist()
logit_model = sm.Logit(df_y, df_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(save_path, result.summary2(), "Flatline_rm_features")

In [None]:
# Backward elimination on the original data, step 3: also drop "Corr_intralead".
# Rebuilding `cols` from the full column index keeps the cell re-runnable (no in-place
# list.remove); Index.drop raises KeyError if a listed metric is not a column.
cols = df_X.columns.drop(["Kurtosis", "Flatline", "Corr_intralead"]).tolist()
logit_model = sm.Logit(df_y, df_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(save_path, result.summary2(), "intralead_rm_features")

In [None]:
# Backward elimination on the original data, step 4: also drop "wPMF".
# Rebuilding `cols` from the full column index keeps the cell re-runnable (no in-place
# list.remove); Index.drop raises KeyError if a listed metric is not a column.
cols = df_X.columns.drop(["Kurtosis", "Flatline", "Corr_intralead", "wPMF"]).tolist()
logit_model = sm.Logit(df_y, df_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(save_path, result.summary2(), "wPMF_rm_features")

In [None]:
# Backward elimination on the original data, step 5: also drop "HR".
# Rebuilding `cols` from the full column index keeps the cell re-runnable (no in-place
# list.remove); Index.drop raises KeyError if a listed metric is not a column.
cols = df_X.columns.drop(["Kurtosis", "Flatline", "Corr_intralead", "wPMF", "HR"]).tolist()
logit_model = sm.Logit(df_y, df_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(save_path, result.summary2(), "Backward_rm_features")

In [None]:
## Without TSD: refit on the original data after also dropping "TSD".
# Rebuilding `cols` from the full column index keeps the cell re-runnable (no in-place
# list.remove); Index.drop raises KeyError if a listed metric is not a column.
cols = df_X.columns.drop(["Kurtosis", "Flatline", "Corr_intralead", "wPMF", "HR", "TSD"]).tolist()
logit_model = sm.Logit(df_y, df_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(save_path, result.summary2(), "TSD_rm_features")

In [None]:
## Original (not resampled) data: Logit fit restricted to the features currently listed in `cols`.
X_new_2 = df_X.loc[:, cols]
logit_model = sm.Logit(endog=df_y, exog=X_new_2)
result = logit_model.fit()
print(result.summary2())


In [None]:
## Features picked from the ExtraTreesClassifier ranking, fitted on the SMOTE-resampled data.
cols = ["wPMF","Corr_interlead","SNRECG"]
X_new_new = os_data_X.loc[:, cols]

logit_model = sm.Logit(endog=os_data_y, exog=X_new_new)
result = logit_model.fit()
print(result.summary2())

In [None]:
## Same feature list (`cols`), fitted on df_X / df_y, i.e. the data that was NOT resampled with SMOTE.
X_new_2 = df_X.loc[:, cols]
logit_model = sm.Logit(endog=df_y, exog=X_new_2)
result = logit_model.fit()
print(result.summary2())

In [None]:
## Features retained by the regularization-based selection, fitted on the SMOTE-resampled data.
cols = ["wPMF","Corr_interlead","SNRECG","Corr_intralead"]
X_new_new = os_data_X.loc[:, cols]

logit_model = sm.Logit(endog=os_data_y, exog=X_new_new)
result = logit_model.fit()
print(result.summary2())

In [None]:

# Same feature list (`cols`), fitted on the original data (df_X / df_y).
X_new_2 = df_X.loc[:, cols]
logit_model = sm.Logit(endog=df_y, exog=X_new_2)
result = logit_model.fit()
print(result.summary2())

In [None]:
### Feature ranking: SelectKBest scored with mutual_info_classif on the original data.
# k="all" keeps every feature whatever their number (the former hard-coded k=8 only
# worked for exactly eight metrics); the selector is used here only for its scores.
# mutual_info_classif is randomized, so its seed is fixed to make the ranking reproducible.
bestfeatures = SelectKBest(
    score_func=lambda X_, y_: mutual_info_classif(X_, y_, random_state=0),
    k="all",
)
fit = bestfeatures.fit(df_X, df_y.values.ravel())
# One row per feature: its name ("Specs") and its mutual-information score.
featureScores = pd.DataFrame({'Specs': df_X.columns, 'Score': fit.scores_})
print(featureScores.nlargest(len(metrics_names), 'Score'))  # all features, best score first

In [None]:
# Feature importances from an ExtraTreesClassifier fitted on the original training split.
# random_state is fixed: the forest is randomized and would otherwise give a
# different ranking on every run.
model_imb = ExtraTreesClassifier(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y.values.ravel(), test_size=0.3, random_state=0)
model_imb.fit(X_train, y_train)
print(model_imb.feature_importances_)
# Horizontal bar chart of the (at most 10) most important features.
feat_importances = pd.Series(model_imb.feature_importances_, index=df_X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.title("ExtraTreesClassifier for features selection, fitted on original training dataset")
plt.grid()
plt.show()

In [None]:
# Same importance analysis, but the training split is oversampled with SMOTE first.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y.values.ravel(), test_size=0.3, random_state=0)

columns = X_train.columns

os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# random_state is fixed: the forest is randomized and would otherwise give a
# different ranking on every run.
model = ExtraTreesClassifier(random_state=0)
model.fit(os_data_X, os_data_y.values.ravel())
print(model.feature_importances_)
# Horizontal bar chart of the (at most 10) most important features.
feat_importances = pd.Series(model.feature_importances_, index=os_data_X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.title("ExtraTreesClassifier results for features selection, fitted on SMOTED training dataset")
plt.grid()
plt.show()

In [None]:
# Pairwise correlation between the metric columns, drawn as an annotated heat map.
corrmat = df_X.corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20, 20))
g = sn.heatmap(
    data=df_X[top_corr_features].corr(),
    annot=True,
    cmap="RdYlGn",
)

In [None]:
### Final trial: feature selection through a regularized logistic regression.
# What the code does: an L2-penalized LogisticRegression wrapped in SelectFromModel,
# fitted on the SMOTE-resampled training split.
# NOTE(review): the earlier header announced "L1 and L2 ... on the original dataset",
# which does not match the code — confirm which one is intended.

from sklearn.feature_selection import SelectFromModel

# `y_test` was previously bound to a misspelled name (`y_test_`); same split as before.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y.values.ravel(), test_size=0.3, random_state=0)
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
columns = X_train.columns
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])
sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l2'))

# scikit-learn expects a 1-D target; a one-column DataFrame triggers a conversion warning.
sel_.fit(os_data_X, os_data_y.values.ravel())

selected_feat = os_data_X.columns[sel_.get_support()]
print(selected_feat)
print('total features: {}'.format((os_data_X.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
# Counts coefficients that are exactly zero in the fitted estimator.
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

In [None]:
#### Train a logistic regression on the SMOTE-resampled training split and evaluate it
#### on the test split of the original data (which therefore stays imbalanced).

X_train, X_test, y_train, y_test_balanced = train_test_split(df_X, df_y.values.ravel(), test_size=0.3, random_state=0)

columns = X_train.columns

os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])
cols = ["Corr_interlead","SNRECG","TSD","wPMF"]
os_data_X = os_data_X[cols]

logreg_balanced = LogisticRegression()
# scikit-learn expects a 1-D target; a one-column DataFrame triggers a conversion warning.
logreg_balanced.fit(os_data_X, os_data_y['y'].to_numpy())

# X_test is already a DataFrame with these columns. Keeping it as a DataFrame (instead
# of converting to a bare array) makes the feature names match those seen at fit time.
x_test_balanced = X_test[cols]
y_pred_balanced = logreg_balanced.predict(x_test_balanced)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg_balanced.score(x_test_balanced, y_test_balanced)))

cm = confusion_matrix(y_test_balanced, y_pred_balanced)
print(cm)

print(classification_report(y_test_balanced, y_pred_balanced))

In [None]:
### Cross-validated ROC curves: one logistic regression per feature set found by the selection methods.

## First curve: called with cols = None.
# NOTE(review): the earlier comment here announced the backward-selection features, but
# no feature list is passed; presumably None means "use every column of df_X" — confirm
# in Logistic_reg_model.ROC_CV_curve.
plt.figure()
Logistic_reg_model.ROC_CV_curve(df_X,df_y,cols = None)

In [None]:
## ROC on the original data, restricted to the features returned by the backward
## selection that was run on the SMOTE-resampled data (SMOTE_feature).
plt.figure()
Logistic_reg_model.ROC_CV_curve(df_X,df_y,cols = SMOTE_feature)

In [None]:
## ROC (10-fold) with the backward-selected features of the original data, minus "Corr_interlead".
# NOTE(review): the earlier header said "Removing TSD", but the feature removed by the
# code is "Corr_interlead" — confirm which one is intended.
# The list is rebuilt rather than mutated with .remove(), so re-running this cell no
# longer raises once the feature is already gone.
Normal_feature = [feature for feature in Normal_feature if feature != "Corr_interlead"]
plt.figure()
Logistic_reg_model.ROC_CV_curve(df_X,df_y,k_cv=10,cols = Normal_feature)

In [None]:
## ROC with a hand-written three-feature list attributed to the ExtraTreesClassifier ranking.
plt.figure()
# NOTE(review): this list differs from the ExtraTrees-based list used for the Logit
# fits earlier in the notebook (which had "wPMF" instead of "Corr_intralead") — confirm.
cols_extra = ["Corr_intralead","Corr_interlead","SNRECG"]
Logistic_reg_model.ROC_CV_curve(df_X,df_y,cols = cols_extra)