In [None]:
import os
import sys
import warnings
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from ecgdetectors import Detectors
from petastorm import make_reader
from sklearn.metrics import auc,roc_curve,precision_recall_curve,roc_auc_score,RocCurveDisplay
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import (RepeatedStratifiedKFold, cross_val_score,
                                     train_test_split)
warnings.simplefilter(action='ignore', category=FutureWarning)
sys.path.append(os.path.join(os.getcwd(), ".."))
import shared_utils.utils_data as utils_data
from shared_utils import Logistic_reg_model


# Directory holding the formatted dataset, and the same location written as a file URL.
# NOTE(review): the directory already starts with "/", so the URL ends up with
# four slashes after "file:" - confirm the reader accepts that form.
path_formatted_glasgow = "/workspaces/maitrise/data/20221006_physio_quality/set-a/dataParquet"
path_petastorm = "file:///" + path_formatted_glasgow

In [None]:
# Directory used both as a cache for the NetCDF datasets and as the output
# location of the result tables written further down.
save_path = "/workspaces/ecg_evaluation/results"
name_method = ["Corr_interlead","Corr_intralead","wPMF","SNRECG","HR","Kurtosis","Flatline","TSD"]

# exist_ok=True makes the cell safe to re-run and removes the check-then-create race.
os.makedirs(save_path, exist_ok=True)

ecg_data_file = os.path.join(save_path, "ecg_data.nc")
quality_metrics_file = os.path.join(save_path, "quality_metrics.nc")

if os.path.exists(quality_metrics_file):
    # Metrics were already computed by a previous run: reuse them.
    ds_metrics = xr.load_dataset(quality_metrics_file)
else:
    print("Computing metrics")
    if os.path.exists(ecg_data_file):
        ds_data = xr.load_dataset(ecg_data_file)
    else:
        ds_data = utils_data.format_data_to_xarray(path_petastorm, save_path)

    # NOTE(review): save_metrics_to_xarray is neither defined nor imported in this
    # notebook, so this branch raises NameError on a fresh kernel. Import it from
    # the module that provides it (not visible from here) before relying on this path.
    ds_metrics = save_metrics_to_xarray(ds_data, name_method, save_path, verbose = True)

In [None]:
###Save summary table into a folder 
def save_table(path_data,summary,name_folder):
    """Save a model summary as a PNG image plus its first two tables as CSV files.

    Parameters
    ----------
    path_data : str
        Parent directory in which the result folder is created.
    summary : object
        Summary object; ``str(summary)`` is rendered into the image and
        ``summary.tables[0]`` / ``summary.tables[1]`` must provide ``to_csv``.
        The notebook passes ``result.summary2()``.
    name_folder : str
        Name of the sub-folder of ``path_data`` that receives ``output.png``,
        ``Results_logit.csv`` and ``Coefficient_results.csv``.

    Raises
    ------
    IndexError
        If ``summary.tables`` holds fewer than two tables.
    """
    table_names = ("Results_logit", "Coefficient_results")
    path_to_folder = os.path.join(path_data, name_folder)
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(path_to_folder, exist_ok=True)

    # Kept from the original implementation: this changes the default figure
    # size globally, so the function's side effect on rcParams is unchanged.
    plt.rc('figure', figsize=(12, 7))
    # Start from a fresh figure so the text is never drawn on top of whatever
    # figure happened to be current when the function was called.
    plt.figure()
    # A monospace font keeps the columns of the text summary aligned.
    plt.text(0.01, 0.05, str(summary), {'fontsize': 17}, fontproperties = 'monospace')
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(os.path.join(path_to_folder, 'output.png'))

    for index, table_name in enumerate(table_names):
        summary.tables[index].to_csv(os.path.join(path_to_folder, table_name + ".csv"))


In [None]:
# Mask every entry whose quality label is "unlabeled", then drop each id that
# is left with missing values.
labelled_mask = ds_metrics.data_quality != "unlabeled"
ds_filtered = ds_metrics.where(labelled_mask).dropna(dim="id")

np_metrics = ds_filtered["quality_metrics"].values
metrics_names = ds_filtered["metric_name"].values.tolist()
np_label = ds_filtered["data_quality"].values

# Two integer encodings of the same labels are prepared:
#   original_label : acceptable -> 1, unacceptable -> 0
#   reverseUNO_y   : acceptable -> 0, unacceptable -> 1 (unacceptable is the positive class)
opposite = True
is_acceptable = np_label == "acceptable"
is_unacceptable = np_label == "unacceptable"

original_label = np_label.copy()
original_label[is_acceptable] = 1
original_label[is_unacceptable] = 0
original_label = original_label.astype(int)

reverseUNO_y = np_label.copy()
reverseUNO_y[is_acceptable] = 0
reverseUNO_y[is_unacceptable] = 1
reverseUNO_y = reverseUNO_y.astype(int)

In [None]:
# Collapse axis 1 of the metric array: every metric is averaged, except "HR",
# for which the minimum along that axis is used instead.
# NOTE(review): axis 1 presumably indexes the ECG leads - confirm against the dataset layout.
HR_index = metrics_names.index("HR")
X = np_metrics.mean(axis=1)
HR_metrics = np_metrics[:, :, HR_index].min(axis=1)
X[:, HR_index] = HR_metrics

# Feature table plus the two target encodings, each stored as a single "y" column.
df_X = pd.DataFrame(X, columns=metrics_names)
df_y_normal = pd.DataFrame(original_label, columns=["y"])
df_y_reverse = pd.DataFrame(reverseUNO_y, columns=["y"])

In [None]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=0)
# The original call referenced `df_y`, which is never defined in this notebook
# (NameError on a fresh kernel). df_y_normal (acceptable = 1, unacceptable = 0)
# is used because every later split in the notebook uses that target.
# NOTE(review): switch to df_y_reverse if the reversed encoding was intended.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y_normal, test_size=0.3, random_state=0)
columns = X_train.columns

# Oversample the training split only, so the test split keeps the original class balance.
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# Sanity check of the class balance after resampling. The messages now name the
# actual classes instead of the "subscription" wording of an unrelated dataset.
n_total = len(os_data_X)
n_unacceptable = int((os_data_y['y'] == 0).sum())
n_acceptable = int((os_data_y['y'] == 1).sum())
print("length of oversampled data is ", n_total)
print("Number of unacceptable signals in oversampled data", n_unacceptable)
print("Number of acceptable signals in oversampled data", n_acceptable)
print("Proportion of unacceptable signals in oversampled data is ", n_unacceptable / n_total)
print("Proportion of acceptable signals in oversampled data is ", n_acceptable / n_total)

In [None]:
## Backward model selection, run once on the SMOTE-resampled training data and
## once on the original, non-resampled data.
SMOTE_feature = Logistic_reg_model.Backward_model_selection(
    os_data_X,
    os_data_y,
)
print(SMOTE_feature)

Normal_feature = Logistic_reg_model.Backward_model_selection(
    df_X,
    df_y_normal,
)
print(Normal_feature)

In [None]:
## Feature ranking obtained with the JMI helper on the original dataset.
# The helper lives in shared_utils.Logistic_reg_model; its implementation is not shown here.
Logistic_reg_model.JMI_calculator(
    df_X,
    df_y_normal,
)

In [None]:
### Logistic model (statsmodels Logit) fitted on the SMOTE-resampled training set,
### using every column of os_data_X as a regressor.

HR_index = os_data_X.columns.values.tolist().index("HR")

logit_model = sm.Logit(endog=os_data_y, exog=os_data_X)
result = logit_model.fit()
print(result.summary2())
# Saving is currently disabled for this model:
# save_table(save_path, result.summary2(), "all_features_SMOTE")

In [None]:
## Impact of TSD: Logit restricted to four metrics, TSD included, fitted on the
## SMOTE-resampled training set. The summary is printed and written to disk.
cols = ["Corr_interlead", "wPMF", "HR", "TSD"]
logit_model = sm.Logit(endog=os_data_y, exog=os_data_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(
    save_path,
    result.summary2(),
    "Backward_model_selection_features_SMOTE",
)

In [None]:
# Same model without TSD, fitted on the SMOTE-resampled training set.
# The feature list is spelled out instead of calling cols.remove("TSD"): the
# in-place removal raised ValueError when the cell was run twice and depended
# on whichever cell last assigned `cols`.
cols = ['Corr_interlead', 'wPMF', 'HR']
logit_model = sm.Logit(os_data_y, os_data_X[cols])
result = logit_model.fit()
print(result.summary2())
# Written to its own folder: the original reused
# "Backward_model_selection_features_SMOTE" and therefore overwrote the
# results of the model that includes TSD.
save_table(save_path, result.summary2(), "Backward_model_selection_features_SMOTE_without_TSD")

In [None]:
## Without taking class imbalance into account: Logit fitted on the original,
## non-resampled dataset.
# All metrics are used, which is what the "all_features" output folder announces.
# The original indexed df_X with `cols`, i.e. with whatever feature subset the
# previously executed cell had left behind, so the saved tables did not match
# their folder name and changed with the execution order.
# NOTE(review): confirm that the full feature set is the intended model here.
logit_model = sm.Logit(df_y_normal, df_X)
result = logit_model.fit()
print(result.summary2())
save_table(save_path, result.summary2(), "all_features")

In [None]:
# Logit on the original dataset restricted to four metrics (TSD included);
# the summary is printed and saved under "Backward_model_selection_features".
cols = ["Corr_interlead", "HR", "SNRECG", "TSD"]
logit_model = sm.Logit(endog=df_y_normal, exog=df_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(
    save_path,
    result.summary2(),
    "Backward_model_selection_features",
)

In [None]:
# Same model without TSD, still fitted on the original (non-resampled) dataset.
# The feature list is written explicitly: cols.remove("TSD") raised ValueError
# when the cell was re-run and relied on the state left by the previous cell.
cols = ['Corr_interlead', 'HR', 'SNRECG']
logit_model = sm.Logit(df_y_normal, df_X[cols])
result = logit_model.fit()
print(result.summary2())
# The folder no longer carries the "_SMOTE" suffix: this model is fitted on
# df_X / df_y_normal, not on the SMOTE-resampled data.
save_table(save_path, result.summary2(), "rm_TSD_features")

In [None]:
# Logit on the original dataset restricted to five metrics; the summary is
# printed and saved under "L2_reg_features".
cols = ["Corr_interlead", "Corr_intralead", "wPMF", "SNRECG", "HR"]
logit_model = sm.Logit(endog=df_y_normal, exog=df_X[cols])
result = logit_model.fit()
print(result.summary2())
save_table(
    save_path,
    result.summary2(),
    "L2_reg_features",
)

In [None]:
### Feature selection: SelectKBest scored with mutual_info_classif
# k="all" scores every column whatever the number of metrics; the original
# hard-coded k=8, which only works when df_X has at least eight columns.
# mutual_info_classif is randomised, so it is seeded here to make the ranking
# reproducible across runs.
bestfeatures = SelectKBest(
    score_func=lambda features, target: mutual_info_classif(features, target, random_state=0),
    k="all",
)
fit = bestfeatures.fit(df_X, df_y_normal.values.ravel())

# One row per metric: its name ("Specs") and its mutual-information score.
featureScores = pd.DataFrame({"Specs": df_X.columns, "Score": fit.scores_})
# Print every metric, best score first.
print(featureScores.nlargest(len(featureScores), "Score"))

In [None]:
# Extra-trees feature importances on the original (imbalanced) training split.
# The classifier is seeded: without random_state the importances, and possibly
# their ranking, changed on every execution of the cell.
model_imb = ExtraTreesClassifier(random_state=0)
cols = df_X.columns.values
print(cols)
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y_normal.values.ravel(), test_size=0.3, random_state=0)
# y_train is already one-dimensional, no further ravel() is needed.
model_imb.fit(X_train, y_train)
print(model_imb.feature_importances_)

# Label the importances with the metric names and plot the largest ones.
feat_importances = pd.Series(model_imb.feature_importances_, index=df_X.columns)
print(feat_importances)
feat_importances.nlargest(10).plot(kind='barh')
plt.title("ExtraTreesClassifier for features selection fitted on original training dataset (score : GINI)")
plt.xlabel("Feature importance")
plt.grid()
plt.show()

In [None]:
# Feature selection helper from shared_utils.Logistic_reg_model, applied to all
# metrics of the original dataset (a restriction to ['wPMF', 'SNRECG', "HR"]
# was tried earlier and is disabled).
Logistic_reg_model.ExtraTreeClassifier_CV_Feature_selection(
    df_X,
    df_y_normal,
)

In [None]:
# Mutual-information based selection helper from shared_utils.Logistic_reg_model
# (implementation not shown here), applied to the original dataset.
Logistic_reg_model.Kbest_MutulaInformation_CV(
    df_X,
    df_y_normal,
)

In [None]:
# Extra-trees feature importances on the SMOTE-resampled training split.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y_normal.values.ravel(), test_size=0.3, random_state=0)

columns = X_train.columns

# `smote` is the seeded SMOTE instance created in the resampling cell above.
os_data_X, os_data_y = smote.fit_resample(X_train, y_train)
os_data_X = pd.DataFrame(data=os_data_X, columns=columns)
os_data_y = pd.DataFrame(data=os_data_y, columns=['y'])

# Seeded so the importances are reproducible; the original classifier was
# unseeded and gave different values on every run.
model = ExtraTreesClassifier(random_state=0)
model.fit(os_data_X, os_data_y.values.ravel())
print(model.feature_importances_)

# Label the importances with the metric names and plot the largest ones.
feat_importances = pd.Series(model.feature_importances_, index=os_data_X.columns)
feat_importances.nlargest(10).plot(kind='barh')
plt.title("ExtraTreesClassifier results for features selection, fitted on SMOTED training dataset")
plt.xlabel("Feature importance")
plt.grid()
plt.show()

In [None]:
# Annotated heat map of the pairwise correlations between the metrics.
# The original called sn.heatmap, but no `sn` alias is imported anywhere in
# this notebook (NameError on a fresh kernel). The figure is drawn with
# matplotlib, which is already imported, so no new dependency is needed.
corrmat = df_X.corr()
top_corr_features = corrmat.index
n_features = len(top_corr_features)

fig, g = plt.subplots(figsize=(20, 20))
image = g.imshow(corrmat.values, cmap="RdYlGn")
fig.colorbar(image, ax=g)

g.set_xticks(range(n_features))
g.set_xticklabels(top_corr_features, rotation=90)
g.set_yticks(range(n_features))
g.set_yticklabels(top_corr_features)

# Write each coefficient inside its cell.
for row in range(n_features):
    for col in range(n_features):
        g.text(col, row, f"{corrmat.iat[row, col]:.2g}", ha="center", va="center")

plt.show()

In [None]:
### Final trial: feature selection through regularisation, on the original dataset only.
# NOTE(review): the original heading announced L1 and L2, but only an
# L2-penalised model is fitted below.

from sklearn.feature_selection import SelectFromModel

# The original unpacked the last element into a misspelled `y_test_`, leaving
# `y_test` bound to the value from an earlier cell.
X_train, X_test, y_train, y_test = train_test_split(df_X, df_y_normal.values.ravel(), test_size=0.3, random_state=0)
columns = X_train.columns
HRindex = list(columns).index("HR")

sel_ = SelectFromModel(LogisticRegression(C=1, penalty='l2'))

# Fit directly on the training split. The original first copied it into
# os_data_X / os_data_y, which overwrote the SMOTE-resampled data held under
# those names, and passed the target as a one-column frame instead of the
# one-dimensional vector scikit-learn expects.
sel_.fit(X_train, y_train)

selected_feat = X_train.columns[sel_.get_support()]
print(selected_feat)
print('total features: {}'.format(X_train.shape[1]))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(
      np.sum(sel_.estimator_.coef_ == 0)))

In [None]:
### Logistic-regression models built on the feature sets proposed by each selection method.

## Feature set from the backward model selection.
Logistic_reg_model.ROC_PR_CV_curve_model(
    df_X,
    df_y_normal,
    cols=["TSD", "Corr_interlead", "HR", "SNRECG"],
    k_cv=10,
    pos_label=0,
)

In [None]:
# Open a new figure, then call the model-saving helper for the backward-selection features.
plt.figure()
Logistic_reg_model.save_model_LR(
    df_X,
    df_y_normal,
    cols=["TSD", "Corr_interlead", "HR", "SNRECG"],
    opp=False,
)

In [None]:
# Classification report helper, backward-selection feature set.
Logistic_reg_model.Classification_report_model(
    df_X,
    df_y_normal,
    cols=["Corr_interlead", "HR", "SNRECG", "TSD"],
)

In [None]:
# Open a new figure, then call the ROC/PR helper for the feature set without TSD.
plt.figure()
Logistic_reg_model.ROC_PR_CV_curve_model(
    df_X,
    df_y_normal,
    cols=["Corr_interlead", "HR", "SNRECG"],
    k_cv=10,
    opp=False,
)

In [None]:
# Classification report helper, feature set without TSD.
Logistic_reg_model.Classification_report_model(
    df_X,
    df_y_normal,
    cols=["Corr_interlead", "HR", "SNRECG"],
)

In [None]:
# Model-saving helper, feature set without TSD.
Logistic_reg_model.save_model_LR(
    df_X,
    df_y_normal,
    cols=["Corr_interlead", "HR", "SNRECG"],
    opp=False,
)

In [None]:
# Model-saving helper, feature set extended with Corr_intralead.
Logistic_reg_model.save_model_LR(
    df_X,
    df_y_normal,
    cols=["Corr_interlead", "HR", "SNRECG", "Corr_intralead"],
    opp=False,
)

In [None]:
# Classification report helper, feature set extended with Corr_intralead.
Logistic_reg_model.Classification_report_model(
    df_X,
    df_y_normal,
    cols=["Corr_interlead", "HR", "SNRECG", "Corr_intralead"],
)

In [None]:
# ROC/PR helper, feature set extended with Corr_intralead.
Logistic_reg_model.ROC_PR_CV_curve_model(
    df_X,
    df_y_normal,
    cols=["Corr_interlead", "HR", "SNRECG", "Corr_intralead"],
    k_cv=10,
    opp=False,
)

In [None]:

# Open a new figure, then call the ROC/PR helper for the five-metric feature set.
plt.figure()
Logistic_reg_model.ROC_PR_CV_curve_model(
    df_X,
    df_y_normal,
    cols=["Corr_interlead", "Corr_intralead", "wPMF", "SNRECG", "HR"],
    k_cv=10,
    opp=False,
)

In [None]:
# ROC/PR helper for wPMF alone, using the reversed target (unacceptable = 1).
Logistic_reg_model.ROC_PR_CV_curve_model(
    df_X,
    df_y_reverse,
    cols=["wPMF"],
    k_cv=10,
)

In [None]:
## Classification report helper for wPMF alone.
# NOTE(review): the previous heading mentioned the SMOTE estimator, but the call
# receives the original, non-resampled df_X / df_y_normal - confirm which was meant.
Logistic_reg_model.Classification_report_model(
    df_X,
    df_y_normal,
    cols=["wPMF"],
)

In [None]:
# Classification report helper, backward-selection feature set (TSD listed first).
Logistic_reg_model.Classification_report_model(
    df_X,
    df_y_normal,
    cols=["TSD", "Corr_interlead", "HR", "SNRECG"],
)

In [None]:
# Classification report helper for wPMF alone (same arguments as the earlier wPMF report cell).
Logistic_reg_model.Classification_report_model(
    df_X,
    df_y_normal,
    cols=["wPMF"],
)

In [None]:
# Threshold helper from shared_utils.Logistic_reg_model (implementation not
# shown here), backward-selection feature set.
Logistic_reg_model.old_threshold_calculator(
    df_X,
    df_y_normal,
    cols=["TSD", "Corr_interlead", "HR", "SNRECG"],
)

In [None]:
# Compare the three candidate feature sets in one call; cols_g[i] is the
# feature list displayed under the label name_model[i].
cols_g = [
    ["TSD", "Corr_interlead", "HR", "SNRECG"],
    ["Corr_interlead", "HR", "SNRECG", "Corr_intralead"],
    ["Corr_interlead", "Corr_intralead", "wPMF", "SNRECG", "HR"],
]
name_model = [
    "Backward selection",
    "JMI/MI",
    "L2 Reg",
]
Logistic_reg_model.Global_comp_ROC_PR_mean_curve(
    df_X,
    df_y_normal,
    cols_g,
    name_model,
    pos_label=0,
)