In [None]:
import sys
sys.path.append('C:\\Users\\AFischer\\PycharmProjects\\term_preterm_database')
sys.path.append('C:\\Users\\AFischer\\PycharmProjects\\term_preterm_database\\src_pre_term_database')

In [None]:
import wfdb
import numpy as np
from chart_studio import plotly as py
import plotly.graph_objs as go
from plotly.offline import iplot, init_notebook_mode
from plotly.subplots import make_subplots
import cufflinks 
from pathlib import Path
cufflinks.go_offline(connected=True)
init_notebook_mode(connected=True)

In [None]:
import constants as c
from src_pre_term_database.load_dataset import build_signal_dataframe, build_demographics_dataframe, build_clinical_information_dataframe, split_term_preterm_rec_ids
from src_pre_term_database.visualizing import create_missing_values_plot, plot_distribution_feature, plot_histogram, plot_boxplot, plot_multiple_boxplots, plot_differences_preterm_and_term_patients, plot_mean_and_std_variables, plot_ehg_data
from utils import read_settings, calculate_percentage

In [None]:
# Folder with the project's settings files; it is also handed to the
# dataframe builders further down in this notebook.
settings_path = 'C:/Users/AFischer/PycharmProjects/term_preterm_database/references/settings'

# Result of read_settings for the 'file_paths' entry; indexed by name below
# (e.g. 'data_path', 'output_path').
file_paths = read_settings(settings_path, 'file_paths')

# Signal column names: for each channel ('1', '2', '3') the bare channel name
# followed by its three '_DOCFILT-*' variants.
# NOTE(review): the DOCFILT columns are presumably filtered versions of the
# channel -- confirm against the database documentation.
SIGNAL_COLUMN_NAMES = [
    f'{channel}{suffix}'
    for channel in ('1', '2', '3')
    for suffix in ('', '_DOCFILT-4-0.08-4', '_DOCFILT-4-0.3-3', '_DOCFILT-4-0.3-4')
]

# Names of the static (per-record) variables.
VARIABLE_CONSTANTS_LIST = [
    'RecID',
    'Age',
    'Parity',
    'Abortions',
    'Weight',
    'Hypertension',
    'Diabetes',
    'Placental_position',
    'Bleeding_first_trimester',
    'Bleeding_second_trimester',
    'Funneling',
    'Smoker',
]

In [None]:
data_path = file_paths['data_path']

In [None]:
data_path

In [None]:
df_signals = build_signal_dataframe(data_path, settings_path)

In [None]:
df_signals.head()

In [None]:
len(df_signals)

In [None]:
df_demographics = build_demographics_dataframe(data_path, settings_path)

In [None]:
df_demographics.head()

In [None]:
df_clinical_information = build_clinical_information_dataframe(data_path, settings_path)

In [None]:
df_clinical_information.head()

In [None]:
df_static_information = df_demographics.merge(df_clinical_information, how='left', on=c.REC_ID_NAME)

In [None]:
df_static_information.head()

In [None]:
df_static_information['time_between_rec_and_birth'] = df_static_information['gestation']-df_static_information['gestation_at_rec_time']

In [None]:
df_preterm, df_term = split_term_preterm_rec_ids(df_static_information, preterm_threshold=37.0)

In [None]:
# Categorical variables whose value distribution is plotted, one figure each.
categorical_features = [
    'group',
    c.HYPERTENSION_NAME,
    c.DIABETES_NAME,
    c.PLACENTAL_NAME,
    c.BLEEDING_FIRST_TRI_NAME,
    c.BLEEDING_SEC_TRI_NAME,
    c.FUNNELING_NAME,
    c.SMOKER_NAME,
]

for cat_feature in categorical_features:
    # format(x) yields the same text as the f-string wrapper f'{x}'.
    plot_distribution_feature(df_static_information, format(cat_feature), categorical=True)
    

In [None]:
# Target file for the merged static dataframe:
# <output_path>/intermediate_datafiles/df_static.csv
output_path = Path(file_paths['output_path'])
intermediate_path = output_path.joinpath("intermediate_datafiles", 'df_static.csv')

In [None]:
# to_csv does not create missing parent folders, so make sure the
# "intermediate_datafiles" directory exists before writing (no-op if it does).
intermediate_path.parent.mkdir(parents=True, exist_ok=True)
df_static_information.to_csv(intermediate_path, sep=';')

In [None]:
plot_distribution_feature(df_static_information, 'group', categorical=True)

In [None]:
plot_distribution_feature(df_static_information, 'abortions', categorical=False)

In [None]:
plot_distribution_feature(df_static_information, 'age', categorical=False)

In [None]:
plot_distribution_feature(df_static_information, 'weight', categorical=False)

In [None]:
plot_histogram(df_static_information, 'gestation')

In [None]:
plot_histogram(df_static_information, 'gestation_at_rec_time')

In [None]:
# NOTE(review): scratch arithmetic that nothing else in the notebook uses --
# presumably 4 days expressed as a fraction of a week; confirm and consider
# removing this cell.
4/7

In [None]:
plot_boxplot(df_static_information, 'gestation_at_rec_time')

In [None]:
plot_boxplot(df_static_information, 'gestation')

In [None]:
plot_boxplot(df_static_information, 'age')

In [None]:
plot_multiple_boxplots(df_static_information, ['weight', 'age', 'gestation', 'gestation_at_rec_time'])

In [None]:
plot_differences_preterm_and_term_patients(df_preterm, df_term, 'abortions', categorical=False)

In [None]:
plot_differences_preterm_and_term_patients(df_preterm, df_term, 'smoker', categorical=True)

In [None]:
plot_differences_preterm_and_term_patients(df_preterm, df_term, 'parity', categorical=False)

In [None]:
plot_differences_preterm_and_term_patients(df_preterm, df_term, 'hypertension', categorical=True)

In [None]:
plot_differences_preterm_and_term_patients(df_preterm, df_term, 'placental_position', categorical=True)

In [None]:
plot_mean_and_std_variables(df_preterm, df_term, ['age', 'weight', 'gestation', 'gestation_at_rec_time'])

In [None]:
# Columns checked for missing values, split into numeric and categorical ones.
numeric_features = [
    c.AGE_NAME,
    c.PARITY_NAME,
    c.ABORTIONS_NAME,
    c.WEIGHT_NAME,
]
# Rebinds categorical_features: unlike the list used for the distribution
# plots earlier, this one does not contain 'group'.
categorical_features = [
    c.HYPERTENSION_NAME,
    c.DIABETES_NAME,
    c.PLACENTAL_NAME,
    c.BLEEDING_FIRST_TRI_NAME,
    c.BLEEDING_SEC_TRI_NAME,
    c.FUNNELING_NAME,
    c.SMOKER_NAME,
]
create_missing_values_plot(df_static_information, numeric_features, categorical_features)

## SIGNAL exploratory data analysis

In [None]:
# Load one record for inspection. The location is derived from the configured
# data_path (records live in its "tpehgdb" sub-folder, as in the other record
# loads in this notebook) instead of a user-specific absolute path, so the
# cell also runs on machines with a different directory layout.
test_record = wfdb.rdrecord(f"{Path(data_path) / 'tpehgdb'}/tpehg546")

In [None]:
display(test_record.__dict__)

In [None]:
plot_ehg_data(data_path, 'tpehg1007', 'minutes', df_static_information)

In [None]:
df_preterm.head()

In [None]:
df_static_information.head()

In [None]:
list(map(lambda word: f"tpehg{word}", df_preterm['rec_id'].unique()))

In [None]:
round(df_static_information['time_between_rec_and_birth'],2).astype(str) + 'weeks'

In [None]:
import pandas as pd

In [None]:
def plot_ehg_data2(path_to_data: str, rec_id: str,
                  time_units: str, df_static_information: pd.DataFrame, **kwargs):
    """Plot the EHG signal data of one patient (rec_id), one subplot per channel.

    The record is read from ``<path_to_data>/tpehgdb/<rec_id>``. Columns 0, 4
    and 8 of its signal matrix are drawn as 'Channel 1', 'Channel 2' and
    'Channel 3', each on a fixed y range of [-0.5, 0.7]. The figure is
    displayed with ``fig.show()``.

    Parameters
    ----------
    path_to_data : str
        Path to folder with the term-preterm database files.
    rec_id : str
        Name of the record id, e.g. 'tpehg1022'.
    time_units : str
        The x axis unit. Allowed options are: 'samples', 'seconds', 'minutes',
        and 'hours'.
    df_static_information : pd.DataFrame
        Dataframe with at least the columns 'rec_id', 'gestation' and
        'gestation_at_rec_time'; the row of this record is used in the title.
    kwargs:
        Extra keyword arguments passed to ``fig.update_xaxes()`` (for example
        ``range=[7, 10]``). They take precedence over the defaults set here.

    Returns
    -------
    None
        The figure is shown, not returned.

    Raises
    ------
    KeyError
        If `time_units` is not one of the allowed options.
    IndexError
        If `df_static_information` contains no row for this record.
    """
    path_to_signals = Path(f'{path_to_data}') / "tpehgdb"

    # (subplot title, column index in record.p_signal, line colour)
    channels = [('Channel 1', 0, 'rgb(67,67,67)'),
                ('Channel 2', 4, 'rgb(115,115,115)'),
                ('Channel 3', 8, 'rgb(49,130,189)')]

    # Same fixed y range for every subplot so the channels are comparable.
    y_range = [-0.5, 0.7]
    line_size = 2

    # This record object contains all signal data and its properties (such as
    # sampling rate, etc.) of one record id.
    record = wfdb.rdrecord(f'{path_to_signals}/{rec_id}')
    # Numeric part of the record name; this is the key used in the 'rec_id' column.
    rec_number = int(record.record_name.split('tpehg')[1])

    patient_rows = df_static_information.query('rec_id==@rec_number')
    preterm_term_gestation = patient_rows['gestation'].iloc[0]
    preterm_term_rec_moment = patient_rows['gestation_at_rec_time'].iloc[0]

    # Construct time indices for the x-axis: sample index divided by the
    # number of samples per requested unit.
    samples_per_unit = {'samples': 1, 'seconds': record.fs,
                        'minutes': record.fs * 60, 'hours': record.fs * 3600}
    sample_index = np.arange(record.sig_len, dtype=float)
    if time_units == 'samples':
        t = sample_index
    else:
        t = sample_index / samples_per_unit[time_units]

    # We plot each channel in a separate subplot
    fig = make_subplots(rows=len(channels), cols=1,
                        subplot_titles=[title for title, _, _ in channels])

    for row, (title, signal_index, color) in enumerate(channels, start=1):
        fig.add_trace(go.Scatter(x=t, y=record.p_signal[:, signal_index], mode='lines',
                                 name=title, line=dict(color=color, width=line_size),
                                 connectgaps=True),
                      row=row,
                      col=1)
        # row/col restrict the update to this subplot; without them every call
        # would overwrite the y-axis title of all three subplots.
        fig.update_yaxes(title_text=record.units[signal_index], range=y_range,
                         tickfont_family="Arial Black", row=row, col=1)

    fig.update_layout(template='plotly_white', height=1100, showlegend=False,
                      title=dict(
                          text=f'<b>EHG data of patient {rec_number}, gestation: {preterm_term_gestation} wks, '
                               f'recording moment: {preterm_term_rec_moment} wks</b>',
                          x=0.5,
                          y=0.98,
                          font=dict(
                              family="Arial",
                              size=20,
                              color='#000000'
                          )
                      ))

    # dtick indicates the tick step and is set in such way that we have
    # approx. 5 ticks on the x axis.
    if 'range' in kwargs:
        x_start, x_end = kwargs['range']
        x_span = float(x_end) - float(x_start)
    else:
        # t is increasing, so its last element is the maximum.
        x_span = float(t[-1]) if t.size else 0.0
    tick_step = x_span / 5

    xaxis_options = dict(title_text=f'{time_units}', tickfont_family="Arial Black")
    if tick_step >= 1:
        xaxis_options.update(tick0=0, dtick=int(tick_step))
    elif tick_step > 0:
        # Spans shorter than 5 units would truncate to a step of 0; use a
        # fractional step rounded to one significant digit instead.
        xaxis_options.update(tick0=0, dtick=float(f'{tick_step:.1g}'))
    # Otherwise (empty signal or non-positive span) plotly picks the ticks.

    # Caller-supplied options win over the defaults above.
    xaxis_options.update(kwargs)
    fig.update_xaxes(**xaxis_options)
    fig.show()

In [None]:
# Rebind data_path as a Path object (it is already a Path when this cell is
# re-run) and load record tpehg1022 from the "tpehgdb" sub-folder.
data_path = Path(format(data_path))
path_to_signals = data_path.joinpath("tpehgdb")
record = wfdb.rdrecord(f'{path_to_signals}/tpehg1022')

In [None]:
record.sig_name

In [None]:
plot_ehg_data2(data_path, 'tpehg1022', 'minutes', df_static_information)

In [None]:
plot_ehg_data(data_path, 'tpehg873', 'minutes', df_static_information)

In [None]:
plot_ehg_data(data_path, 'tpehg1022', 'minutes', df_static_information)

In [None]:
plot_ehg_data(data_path, 'tpehg1007', 'minutes', df_static_information)

In [None]:
plot_ehg_data(data_path, 'tpehg1202', 'minutes', df_static_information)

In [None]:
plot_ehg_data(data_path, 'tpehg1202', 'minutes', df_static_information, range=[7, 10])

In [None]:
# Record names ('tpehg' + rec_id) of all records in df_preterm.
preterm_rec_ids = [f"tpehg{rec_number}" for rec_number in df_preterm['rec_id'].unique()]

In [None]:
for pre_rec_id in preterm_rec_ids:
    plot_ehg_data(data_path, pre_rec_id, 'minutes', df_static_information)

In [None]:
plot_histogram(df_static_information, 'time_between_rec_and_birth')

In [None]:
plot_differences_preterm_and_term_patients(df_preterm, df_term, 'time_between_rec_and_birth', categorical=False)

In [None]:
plot_mean_and_std_variables(df_preterm, df_term, ['time_between_rec_and_birth'])

In [None]:
plot_distribution_feature(df_static_information, 'time_between_rec_and_birth', categorical=False)

In [None]:
# Plot the signals of every record whose time_between_rec_and_birth is below 3
# (the column is gestation minus gestation_at_rec_time).
short_interval_rows = df_static_information.query('time_between_rec_and_birth < 3')
short_time_rec_ids = [
    f"tpehg{rec_number}"
    for rec_number in short_interval_rows['rec_id'].unique()
]

for rec_id in short_time_rec_ids:
    plot_ehg_data(data_path, rec_id, 'minutes', df_static_information)