In [1]:
import warnings
import pandas as pd
import ast
from pandas import json_normalize
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import numpy as np
from transformers import pipeline, AutoTokenizer, AutoModel
import json

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


### Data Analysis

should work for any report type as long as it's formatted properly

In [2]:
# Columns that are NOT treated as numeric measurements (identifiers, free text,
# dates, categories). fix_data_types skips them when coercing columns to numeric
# and handle_missing_values skips them when imputing column means.
nominal_cols = ['SourcePDF', 'Comments', 'DocType', 'ExamDate', 'Device', 'ID', 'Race', 'Occupation', 'Technician', 'RecordingType']

def analyze_data(df):
    """Clean a report DataFrame and display the exploratory plots.

    Steps: replace the '---' placeholder with ``pd.NaT``, coerce dtypes,
    impute missing values, then draw the graphs.

    Args:
        df: Report data; must contain the columns the helpers expect
            (e.g. ``ExamDate``).

    Returns:
        The cleaned DataFrame. ``DataFrame.replace`` returns a new object, so
        the frame passed in by the caller is NOT modified; previously the
        cleaned copy was discarded (the function returned ``None``).
    """
    df = df.replace('---', pd.NaT)
    fix_data_types(df)
    handle_missing_values(df)
    display_graphs(df)
    return df

def fix_data_types(df):
    """Coerce column dtypes in place.

    ``ExamDate`` is parsed as a day-month-year date. Every column that is not
    listed in ``nominal_cols`` is converted with ``pd.to_numeric`` using
    ``errors='coerce'``, so values that cannot be parsed become NaN.
    """
    df['ExamDate'] = pd.to_datetime(df['ExamDate'], format='%d-%m-%Y')

    # Everything outside the nominal list is expected to be a measurement.
    numeric_candidates = [name for name in df.columns if name not in nominal_cols]
    for name in numeric_candidates:
        df[name] = pd.to_numeric(df[name], errors='coerce')


def handle_missing_values(df):
    """Fill missing values in place.

    Columns whose name contains 'Position' get 0; every other column that is
    not in ``nominal_cols`` gets its own mean.
    """
    # Body-position statistics: a missing value means no events / no time
    # spent in that position, so 0 is the right fill.
    position_cols = [name for name in df.columns if 'Position' in name]
    for name in position_cols:
        df[name] = df[name].fillna(0)

    # Mean imputation for now; a model-based imputer could be tried later once
    # more data is available.
    measured_cols = [name for name in df.columns if name not in nominal_cols]
    for name in measured_cols:
        series = df[name]
        df[name] = series.fillna(series.mean())

    #! NOTE: patient measurements (height, weight, age) are always missing on
    #! short reports, so they cannot be used for analysis for now.


def univariate_analysis(df, column_name):
    """Print and return descriptive statistics for one numeric column.

    Args:
        df: DataFrame holding the data.
        column_name: Name of the column to summarise.

    Returns:
        A dict with the keys 'Mean', 'Median', 'Min', 'Max', 'Kurtosis',
        'Skewness', 'Standard Deviation' and 'Variance'; or an explanatory
        string when the column is missing or not numeric.
    """
    if column_name not in df.columns:
        return f"Column {column_name} not found in DataFrame."

    data = df[column_name]

    # Accept every numeric dtype (int16, uint8, nullable Int64, ...) instead of
    # a fixed whitelist of four dtype names. Booleans stay rejected, as before.
    is_numeric = pd.api.types.is_numeric_dtype(data) and not pd.api.types.is_bool_dtype(data)
    if not is_numeric:
        return f"Column {column_name} is not numeric."

    summary = {
        'Mean': data.mean(),
        'Median': data.median(),
        'Min': data.min(),
        'Max': data.max(),
        'Kurtosis': data.kurtosis(),
        'Skewness': data.skew(),
        'Standard Deviation': data.std(),
        'Variance': data.var(),
    }

    # One "<label>: <value>" line per statistic, in the order listed above.
    for label, value in summary.items():
        print(f"{label}: {value}")

    return summary

def bivariate_analysis(df, column_name1, column_name2):
    """Plot two columns against each other and report their correlations.

    Draws a scatter plot, prints Pearson and Spearman correlations, draws a
    regression joint plot and finally a box plot.

    Returns:
        A dict with the Pearson and Spearman coefficients and p-values.
    """
    pair_title = f"{column_name1} vs {column_name2}"

    # 1) Scatter plot
    plt.figure(figsize=(10, 6))
    xs, ys = df[column_name1], df[column_name2]
    sns.scatterplot(data=df, x=xs, y=ys)
    plt.title(f"Scatter plot {pair_title}")
    plt.xlabel(column_name1)
    plt.ylabel(column_name2)
    plt.show()

    # 2) Linear (Pearson) and rank (Spearman) correlation
    r_pearson, p_pearson = stats.pearsonr(xs, ys)
    r_spearman, p_spearman = stats.spearmanr(xs, ys)

    print(f"Pearson correlation between {column_name1} and {column_name2}: r = {r_pearson}, p = {p_pearson}")
    print(f"Spearman correlation between {column_name1} and {column_name2}: r = {r_spearman}, p = {p_spearman}")

    # 3) Joint plot with a fitted regression line
    sns.jointplot(data=df, x=column_name1, y=column_name2, kind='reg')
    plt.show()

    # 4) Box plot
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=xs, y=ys)
    plt.title(f"Box plot {pair_title}")
    plt.xlabel(column_name1)
    plt.ylabel(column_name2)
    plt.show()

    return {
        'Pearson Correlation': r_pearson,
        'Pearson P-value': p_pearson,
        'Spearman Correlation': r_spearman,
        'Spearman P-value': p_spearman,
    }
    


def display_graphs(df, target_col='SleepEfficiency'):
    """Show histograms, a correlation heatmap and scatter plots for ``df``.

    Args:
        df: DataFrame to visualise; only its numeric columns are plotted.
        target_col: Column used as the y-axis of the per-column scatter plots.
            Defaults to 'SleepEfficiency', the value that used to be hard-coded.
    """
    numeric_cols = df.select_dtypes(include=np.number).columns

    # Histogram of every numeric column.
    for col in numeric_cols:
        plt.figure()
        sns.histplot(df[col])
        plt.title(col)
        # The original had `plt.show` without parentheses, which only
        # referenced the function and never rendered the figure.
        plt.show()

    # Correlation matrix of the numeric columns.
    corr = df[numeric_cols].corr()
    plt.figure(figsize=(20, 15))
    sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".1f")
    plt.show()

    # Scatter plot of every numeric column against the target column.
    for col in numeric_cols:
        plt.figure()
        sns.scatterplot(data=df, x=col, y=target_col)
        plt.title(col)
        plt.show()

def medi_albertina_comment_analysis(df):
    """Run the MediAlbertina NER model over every report comment.

    Entities tagged 'Resultado' are dropped because they are not relevant for
    this analysis. The result is written to 'entities.json' and returned.

    Args:
        df: DataFrame with 'Comments' and 'SourcePDF' columns.

    Returns:
        Dict mapping each row's SourcePDF to its list of entity dicts, with
        'score', 'start' and 'end' converted to strings.
    """
    ner_pipeline = pipeline('ner', model='portugueseNLP/medialbertina_pt-pt_900m_NER', aggregation_strategy='average')

    entities = {}

    for _, row in df.iterrows():
        comment = row['Comments']
        # `comment != pd.NaT` is always True (NaT, like NaN, never compares
        # equal to anything), so missing comments used to reach the model.
        if pd.isna(comment):
            continue

        source = row['SourcePDF']
        found = [entity for entity in ner_pipeline(comment) if entity['entity_group'] != 'Resultado']

        # Convert the numeric fields to strings so json.dump can serialize them.
        for entity in found:
            entity['score'] = str(entity['score'])
            entity['start'] = str(entity['start'])
            entity['end'] = str(entity['end'])

        entities[source] = found

        print(f"Extracted entities from {source}")
        print(found)

    with open('entities.json', 'w', encoding='utf-8') as f:
        json.dump(entities, f, ensure_ascii=False, indent=4)

    return entities

#! NOT FULLY IMPLEMENTED
def bioBERTpt_comment_analysis(text):
    """Print the NER output of the 'pucpr/biobertpt-all' pipeline for ``text``.

    Still a work in progress: results are only printed, nothing is returned.
    """
    model_name = "pucpr/biobertpt-all"
    tagger = pipeline("ner", model=model_name, tokenizer=model_name)

    for hit in tagger(text):
        word, label, confidence = hit['word'], hit['entity'], hit['score']
        print(f"Entity: {word}, Label: {label}, Confidence: {confidence:.2f}")

#### short report notes

Although there is a clear lack of data, we can make some assumptions and reach conclusions based on what is available:
1. The biggest factors contributing to sleep efficiency are the total time slept and the total time spent in REM and NREM sleep.
2. Apneas and hypopneas seem to have a negative effect on sleep efficiency (as expected).

### REM LOGIC

In [None]:
# Load the report summary CSV and list its column names.
df = pd.read_csv('../data/csvs/report_summary.csv')
df.columns

In [None]:
# Load the RemLogic CSV (this rebinds `df`, replacing the frame loaded above)
# and list its column names.
df = pd.read_csv('../data/csvs/RemLogicData.csv')
df.columns

In [None]:
# Inspect the 'SnoreDuration' column of the currently loaded `df`.
df['SnoreDuration']

In [None]:
# Apply the generic cleaning steps to the current `df`, then show its 'Type' column.
# NOTE(review): fix_data_types indexes df['ExamDate'], so this cell only works
# when the loaded frame has that column — confirm which CSV is expected here.
df = df.replace('---', pd.NaT)
fix_data_types(df)
handle_missing_values(df)
df['Type']


Clean the data that is common to both RemLogic report types

In [None]:
def clean_common_data(df):
    """Clean, in place, the columns shared by both RemLogic report types.

    Columns rewritten: 'total_recording_time' (int minutes), 'age' (years,
    missing values replaced by the truncated mean), 'height' and 'weight'
    (floats, missing values replaced by the mean), 'Snore Time' (minutes as
    float), 'Lowest Oxygen Saturation' (float percent) and 'Saturation < 90%'
    ('- - %' placeholders replaced by '0 minutes 0 %', kept as text).

    Columns added: 'Apnea + Hypopnea (A+H) Rate' and 'Saturation < 90% Rate'.

    Args:
        df: DataFrame with the raw string columns listed above.

    Returns:
        None; ``df`` is modified in place.
    """

    def to_float(series):
        """Parse decimal-comma strings ('3,4') into floats."""
        return series.str.replace(',', '.', regex=False).str.strip().astype(float)

    def rate_per_hour(series):
        """Return the rate of '<count> <rate> / h' strings as floats."""
        rate = series.str.split(' ', n=1).str[1].str.replace(' / h', '', regex=False)
        return to_float(rate)

    def minutes_and_percent(series):
        """Split '<minutes> minutes <percent> %' strings into two float Series."""
        parts = series.str.split(' minutes', n=1)
        minutes = to_float(parts.str[0])
        percent = to_float(parts.str[1].str.replace('%', '', regex=False))
        return minutes, percent

    def parse_age(entry):
        """Return the age in years, or None when it is missing or given in months."""
        if pd.isna(entry):
            return None
        entry = entry.strip()
        if ' month(s)' in entry:
            return None
        return int(entry.split(' year(s)')[0])

    # Total recording time: '<n> minutes' -> int
    df['total_recording_time'] = (
        df['total_recording_time'].str.replace(' minutes', '', regex=False).astype(int)
    )

    # Age: unknown ages get the truncated mean of the known ones. With no known
    # age at all the column is left as NaN (the mean used to divide by zero).
    ages = [parse_age(entry) for entry in df['age']]
    known_ages = [age for age in ages if age is not None]
    fill_age = int(sum(known_ages) / len(known_ages)) if known_ages else np.nan
    df['age'] = [fill_age if age is None else age for age in ages]

    # Height ('1,75 m') and weight ('80,5 kg'): strip the unit, impute the mean.
    for column, unit in (('height', ' m'), ('weight', ' kg')):
        values = df[column].replace("NaN", np.nan)
        values = to_float(values.str.replace(unit, '', regex=False))
        df[column] = values.fillna(values.mean())

    # Apnea + Hypopnea (A+H): keep the raw column, add the hourly rate.
    df['Apnea + Hypopnea (A+H) Rate'] = rate_per_hour(df['Apnea + Hypopnea (A+H)'])

    # Snore Time: keep only the minutes; '- - %' means no snoring recorded.
    snore_minutes, _ = minutes_and_percent(df['Snore Time'].replace('- - %', '0 minutes 0 %'))
    df['Snore Time'] = snore_minutes

    # Lowest Oxygen Saturation: '<x> %' -> float
    df['Lowest Oxygen Saturation'] = to_float(
        df['Lowest Oxygen Saturation'].str.replace(' %', '', regex=False)
    )

    # Saturation < 90%: normalise the placeholder, add the percentage column.
    df['Saturation < 90%'] = df['Saturation < 90%'].replace('- - %', '0 minutes 0 %')
    _, below_90_percent = minutes_and_percent(df['Saturation < 90%'])
    df['Saturation < 90% Rate'] = below_90_percent

Clean the Data From **Polysomnography** Report RemLogic 

In [None]:
def clean_data_Polysomnography(df):
    """Clean, in place, the columns specific to RemLogic polysomnography reports.

    Rewritten columns: 'Sleep Efficiency' and 'Relative Snoring Time' (float
    percent), 'Number of Snoring Episodes' (int), the three '* Hypopnea'
    sub-type columns ('- -' placeholders become '0 0'), and 'apenea' /
    'position' (string literals parsed into lists of dicts).

    Added columns: a '<name> Rate' column for each apnea/hypopnea column, and
    one column per record and field of the 'apenea' table (8 records) and the
    'position' table (6 records), named '<table>_<label>_<field>' with the
    labels taken from the first row.

    Works for any number of rows; the previous implementation hard-coded
    exactly 10.

    Args:
        df: DataFrame with the raw polysomnography columns.

    Returns:
        None; ``df`` is modified in place.
    """

    def to_float(series):
        """Parse decimal-comma strings ('3,4') into floats."""
        return series.str.replace(',', '.', regex=False).str.strip().astype(float)

    def percent(series):
        """Parse '<x> %' strings into floats."""
        return to_float(series.str.replace(' %', '', regex=False))

    def rate_per_hour(series):
        """Return the rate of '<count> <rate> / h' strings as floats."""
        rate = series.str.split(' ', n=1).str[1].str.replace(' / h', '', regex=False)
        return to_float(rate)

    def decimal(text):
        """Parse a single decimal-comma string into a float."""
        return float(text.replace(',', '.'))

    def expand_table(column, label_key, fields, n_records):
        """Parse the list-of-dicts literal in ``column`` and add one column per record and field."""
        df[column] = df[column].apply(ast.literal_eval)
        tables = df[column].tolist()
        if not tables:
            return  # empty frame: no first row to take the labels from
        labels = [tables[0][k][label_key] for k in range(n_records)]
        for field, cast in fields:
            for k, label in enumerate(labels):
                # '-' marks "no value" in the report and is stored as 0.
                df[f"{column}_{label}_{field}"] = [
                    0 if table[k][field] == '-' else cast(table[k][field])
                    for table in tables
                ]

    # Plain percentages and counts
    df['Sleep Efficiency'] = percent(df['Sleep Efficiency'])
    df['Relative Snoring Time'] = percent(df['Relative Snoring Time'])
    df['Number of Snoring Episodes'] = df['Number of Snoring Episodes'].astype(int)

    # '<count> <rate> / h' columns: keep the raw text, add the hourly rate.
    for column in ('Obstructive Apnea', 'Central Apnea', 'Mixed Apnea', 'Hypopnea (All)'):
        df[f'{column} Rate'] = rate_per_hour(df[column])

    # Hypopnea sub-types may be reported as '- -' (no events): treat as zero.
    for column in ('Obstructive Hypopnea', 'Central Hypopnea', 'Mixed Hypopnea'):
        df[column] = df[column].replace('- -', '0 0')
        df[f'{column} Rate'] = rate_per_hour(df[column])

    # Apnea table: 8 respiration records per report
    expand_table('apenea', 'Respiration', (('Number', int), ('A or H/h', decimal)), 8)

    # Position table: 6 body-position records per report
    expand_table('position', 'Position', (('Index time', decimal), ('A or H/h', decimal)), 6)

Clean the Data From **Polygraphy** Report RemLogic 

In [None]:
def clean_data_Polygraphy(df):
    """Clean, in place, the columns specific to RemLogic polygraphy reports.

    Rewritten columns: 'BMI' (float, missing values replaced by the mean),
    'Index Time' (float minutes), 'Supine Time', 'Non-Supine Time',
    'Upright Time' and 'Movement Time' (float percent of the
    '<minutes> minutes <percent> %' text), 'Average Oxygen Saturation' (float
    percent), 'Oxygen Desaturation Events' and 'Autonomic Arousal' (hourly
    rate), and 'apenea' (string literal parsed into a list of dicts).

    Added columns: 'Supine A+H Rate', 'Non-Supine A+H Rate', and one column per
    record and field of the 'apenea' table (8 records), named
    'apenea_<Respiration>_Number' and 'apenea_<Respiration>_Mean [seconds]'
    with the labels taken from the first row.

    Works for any number of rows; the previous implementation hard-coded
    exactly 10.

    Args:
        df: DataFrame with the raw polygraphy columns.

    Returns:
        None; ``df`` is modified in place.
    """

    def to_float(series):
        """Parse decimal-comma strings ('3,4') into floats."""
        return series.str.replace(',', '.', regex=False).str.strip().astype(float)

    def rate_per_hour(series, dash_as_zero=False):
        """Return the rate of '<count> <rate> / h' strings as floats."""
        rate = series.str.split(' ', n=1).str[1].str.replace(' / h', '', regex=False)
        if dash_as_zero:
            # A bare '-' rate means "no events".
            rate = rate.replace('-', '0')
        return to_float(rate)

    def percent_of_time(series):
        """Return the percent part of '<minutes> minutes <percent> %' strings."""
        filled = series.replace('- - %', '0 minutes 0 %')
        share = filled.str.split(' minutes', n=1).str[1].str.replace('%', '', regex=False)
        return to_float(share)

    def decimal(text):
        """Parse a single decimal-comma string into a float."""
        return float(text.replace(',', '.'))

    def expand_table(column, label_key, fields, n_records):
        """Parse the list-of-dicts literal in ``column`` and add one column per record and field."""
        df[column] = df[column].apply(ast.literal_eval)
        tables = df[column].tolist()
        if not tables:
            return  # empty frame: no first row to take the labels from
        labels = [tables[0][k][label_key] for k in range(n_records)]
        for field, cast in fields:
            for k, label in enumerate(labels):
                # '-' marks "no value" in the report and is stored as 0.
                df[f"{column}_{label}_{field}"] = [
                    0 if table[k][field] == '-' else cast(table[k][field])
                    for table in tables
                ]

    # BMI: decimal comma -> float, then mean imputation
    bmi = to_float(df['BMI'].replace("NaN", np.nan))
    df['BMI'] = bmi.fillna(bmi.mean())

    # Index Time: '<x> minutes' -> float
    df['Index Time'] = to_float(df['Index Time'].str.replace(' minutes', '', regex=False))

    # A+H by sleeping position: keep the raw text, add the hourly rate.
    df['Supine A+H Rate'] = rate_per_hour(df['Supine A+H'])
    df['Non-Supine A+H Rate'] = rate_per_hour(df['Non-Supine A+H'], dash_as_zero=True)

    # NOTE: the 'RDI' column is intentionally left untouched (its cleaning was
    # disabled in the original code).

    # Time-per-position columns: replaced by the percentage share.
    for column in ('Supine Time', 'Non-Supine Time', 'Upright Time', 'Movement Time'):
        df[column] = percent_of_time(df[column])

    # Average Oxygen Saturation: '<x> %' -> float
    df['Average Oxygen Saturation'] = to_float(
        df['Average Oxygen Saturation'].str.replace(' %', '', regex=False)
    )

    # Event columns: replaced by their hourly rate.
    for column in ('Oxygen Desaturation Events', 'Autonomic Arousal'):
        df[column] = rate_per_hour(df[column], dash_as_zero=True)

    # Apnea table: 8 respiration records per report
    expand_table('apenea', 'Respiration', (('Number', int), ('Mean [seconds]', decimal)), 8)

Análise dos Comentários **Polysomnography**

In [None]:
def analysis_comments_polysomnography(df):
    """Append the NER entities found in each polysomnography comment to a text file.

    For every row of ``df['comments']`` a 'File Index <index>:' header is
    written, followed by one '<group> - <text>' line per detected entity
    (entities tagged 'Resultado' are skipped) and a separator line.

    The output file is opened once in append mode, so results accumulate across
    runs. It used to be reopened for every row.

    Args:
        df: DataFrame with a 'comments' column. Rows whose comment is not a
            string (e.g. NaN) get a header and separator but no entities.
    """
    ner_pipeline = pipeline('ner', model='portugueseNLP/medialbertina_pt-pt_900m_NER', aggregation_strategy='average')
    with open('../data/csvs/ner_results_Rem.txt', 'a', encoding='utf-8') as file:
        for index, sentence in df['comments'].items():
            file.write(f"File Index {index}:\n")
            # Missing comments cannot be fed to the model; record them as empty.
            if isinstance(sentence, str):
                for entity in ner_pipeline(sentence):
                    entity_group = entity['entity_group']
                    if entity_group == "Resultado":
                        continue
                    # Take the text from the original sentence via the offsets.
                    entity_text = sentence[entity['start']:entity['end']]
                    file.write(f"{entity_group} - {entity_text}\n")
            file.write("-------------------------------------------------------------------\n")


Análise dos Comentários **Polygraphy**

In [None]:
def analysis_comments_polygraphy(df):
    """Append the first 'Diagnostico' entity of each polygraphy comment to a text file.

    For every row of ``df['comments']`` a 'File Index <index>:' header is
    written, then at most one 'Diagnostico - <text>' line (the first such
    entity the model returns), then a separator line.
    """
    ner_pipeline = pipeline('ner', model='portugueseNLP/medialbertina_pt-pt_900m_NER', aggregation_strategy='average')
    separator = "-------------------------------------------------------------------\n"

    # The file is opened in append mode, so results accumulate across runs.
    with open('../data/csvs/ner_results_Rem_Poly.txt', 'a', encoding='utf-8') as file:
        for index, sentence in df['comments'].items():
            detected = ner_pipeline(sentence)
            file.write(f"File Index {index}:\n")

            # Only the first diagnosis entity is reported.
            diagnosis = next(
                (item for item in detected if item['entity_group'] == "Diagnostico"),
                None,
            )
            if diagnosis is not None:
                span_text = sentence[diagnosis['start']:diagnosis['end']]
                file.write(f"{diagnosis['entity_group']} - {span_text}\n")

            file.write(separator)


Data Loading Polysomnography

In [10]:
# Load the RemLogic polysomnography CSV and show the raw 'position' value of
# the first row: a string holding a Python list-of-dicts literal.
df = pd.read_csv('../data/csvs/RemLogicData.csv')
df['position'][0]

"[{'Position': 'Supine', 'Index time': '104,6', 'A or H/h': '29,8'}, {'Position': 'Left', 'Index time': '90,5', 'A or H/h': '2,0'}, {'Position': 'Prone', 'Index time': '0,0', 'A or H/h': '-'}, {'Position': 'Right', 'Index time': '170,4', 'A or H/h': '6,3'}, {'Position': 'Upright', 'Index time': '0,0', 'A or H/h': '-'}, {'Position': 'Unknown', 'Index time': '0,0', 'A or H/h': '-'}]"

Data Loading Polygraphy

In [None]:
# Load the RemLogic polygraphy CSV and clean it; both cleaning functions modify
# `df` in place.
df = pd.read_csv('../data/csvs/RemLogicPolyData.csv')
clean_common_data(df)
clean_data_Polygraphy(df)
# Comment analysis (NER) is disabled here.
#analysis_comments_polygraphy(df)
# Drop the stray ':' from the generated column name.
df = df.rename(columns={'apenea_Hypopnea (All):_Number': 'apenea_Hypopnea (All)_Number'})
df.columns.to_list()

Análise do Tempo dos Exames

In [None]:
# Descriptive statistics of the total recording time for the currently loaded `df`.
univariate_analysis(df, 'total_recording_time')

Análise da "Lowest Oxygen Saturation"

In [None]:
# Descriptive statistics of the lowest oxygen saturation for the currently loaded `df`.
univariate_analysis(df, 'Lowest Oxygen Saturation')

Análise de "Apnea + Hypopnea (A+H)"

In [None]:
# Descriptive statistics of the hourly apnea + hypopnea rate (the column added by clean_common_data).
univariate_analysis(df, 'Apnea + Hypopnea (A+H) Rate')

Análise Bivariável

In [None]:
# Select the numeric columns and exclude the 'ID' column from the correlation analysis.
# NOTE(review): list.remove raises ValueError when 'ID' is not among the
# numeric columns — confirm the loaded frame always has a numeric 'ID'.
numeric_cols = df.select_dtypes(include=np.number).columns
numeric_cols_list = list(numeric_cols)
numeric_cols_list.remove('ID')
numeric_cols = pd.Index(numeric_cols_list)

#plot histograms for all numeric columns
# (disabled: the block below is a bare string literal, so it is never executed)
"""
for col in numeric_cols:
    plt.figure(figsize=(5, 5))
    sns.histplot(df[col])
    plt.title(col)
    plt.show
"""

#plot correlation matrix
# Pairwise correlations of the remaining numeric columns, drawn as an annotated heatmap.
corr = df[numeric_cols].corr()
plt.figure(figsize=(30, 20))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.show()

#### RemLogic Notes
Após a análise destes dados, conseguimos perceber que o "total_recording_time", "Sleep Efficiency" e "Apnea + Hypopnea (A+H)" não têm qualquer relação entre si, tal como "Apnea + Hypopnea (A+H)" e "Number of Snoring Episodes". Já o "Sleep Efficiency" e "Apnea + Hypopnea (A+H)", tal como "Lowest Oxygen Saturation", "Apnea + Hypopnea (A+H)" e "Relative Snoring Time", têm relações fortes entre si.

### Data loading

In [None]:
# Load the report summary CSV and turn the '---' placeholder into pd.NaT.
df = pd.read_csv('../data/csvs/report_summary.csv')
df = df.replace('---', pd.NaT)

# Coerce dtypes and impute missing values; both helpers modify `df` in place.
fix_data_types(df)
handle_missing_values(df)
