In [1]:
import numpy as np
import pandas as pd
import wfdb
from fancyimpute import KNN
import matplotlib.pyplot as plt

# Подгрузка данных

In [5]:
kaggle_formatted = pd.read_csv('../aiijc_data/kaggle_formatted/train_meta.csv')
kaggle_siganls = pd.read_csv('../aiijc_data/kaggle_formatted/train_signal.csv')

ptbxl = pd.read_csv('../aiijc_data/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/ptbxl_database.csv')

aiijc = pd.read_csv('../aiijc_data/task_1/train/train_meta.csv')
aiijc_labels = pd.read_csv('../aiijc_data/task_1/train/train_gts.csv')

aiijc_test = pd.read_csv('../aiijc_data/task_1/test/test_meta.csv')

# Обработка

In [81]:
kaggle_formatted = kaggle_formatted.loc[(kaggle_formatted.NORM == 1) | (kaggle_formatted.MI == 1)]
kaggle_formatted = kaggle_formatted.loc[:, ['ecg_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site', 'device', 'MI']]

In [82]:
ptbxl = ptbxl[ptbxl.ecg_id.isin(kaggle_formatted.ecg_id)]
kaggle_formatted = kaggle_formatted[kaggle_formatted.ecg_id.isin(ptbxl.ecg_id)]

In [83]:
kaggle_formatted = kaggle_formatted.merge(ptbxl[['filename_hr', 'ecg_id']], on='ecg_id', how='left')

In [84]:
aiijc = aiijc.merge(aiijc_labels, on='record_name', how='left')
aiijc = aiijc[~aiijc.ecg_id.isin(kaggle_formatted.ecg_id)]
aiijc = aiijc.rename({'myocard': 'MI'}, axis=1)

# Создание объединненого датасета

In [76]:
new_data = pd.concat([kaggle_formatted, aiijc[['ecg_id', 'age', 'sex', 'height', 'weight', 'nurse', 'site', 'device', 'filename_hr', 'MI']]])

In [77]:
new_data['record_name'] = new_data['filename_hr'].apply(lambda x: x[-8:])

In [73]:
new_data_labels = new_data[['record_name', 'MI']].rename({'MI': 'myocard'}, axis=1)
new_data = new_data.drop(["MI"], axis=1)

# Заполнение пропуском с использованием KNN

In [333]:
new_data['nurse'] = new_data['nurse'].fillna(int(new_data.nurse.mode()))
new_data['site'] = new_data['site'].fillna(int(new_data.site.mode()))

In [335]:
KNN_imputer = KNN()
new_data.loc[:, ['sex', 'age', 'height', 'weight', 'nurse', 'site']] = KNN_imputer.fit_transform(new_data[ ['sex', 'age', 'height', 'weight', 'nurse', 'site']])

Imputing row 1/12931 with 1 missing, elapsed time: 39.640
Imputing row 101/12931 with 2 missing, elapsed time: 39.664
Imputing row 201/12931 with 1 missing, elapsed time: 39.672
Imputing row 301/12931 with 0 missing, elapsed time: 39.687
Imputing row 401/12931 with 0 missing, elapsed time: 39.697
Imputing row 501/12931 with 1 missing, elapsed time: 39.706
Imputing row 601/12931 with 0 missing, elapsed time: 39.714
Imputing row 701/12931 with 1 missing, elapsed time: 39.730
Imputing row 801/12931 with 2 missing, elapsed time: 39.737
Imputing row 901/12931 with 0 missing, elapsed time: 39.753
Imputing row 1001/12931 with 1 missing, elapsed time: 39.761
Imputing row 1101/12931 with 0 missing, elapsed time: 39.769
Imputing row 1201/12931 with 0 missing, elapsed time: 39.790
Imputing row 1301/12931 with 1 missing, elapsed time: 39.810
Imputing row 1401/12931 with 0 missing, elapsed time: 39.826
Imputing row 1501/12931 with 0 missing, elapsed time: 39.834
Imputing row 1601/12931 with 0 missi

# Создание папки исходного датасета

In [347]:
aiijc = pd.read_csv('../aiijc_data/task_1/train/train_meta.csv')

In [353]:
aiijc = aiijc[['age', 'sex', 'height', 'weight', 'nurse', 'site', 'device', 'record_name']]

In [356]:
aiijc.loc[:, ['sex', 'age', 'height', 'weight', 'nurse', 'site']] = KNN_imputer.fit_transform(aiijc[['sex', 'age', 'height', 'weight', 'nurse', 'site']])

Imputing row 1/2101 with 1 missing, elapsed time: 0.559
Imputing row 101/2101 with 2 missing, elapsed time: 0.562
Imputing row 201/2101 with 0 missing, elapsed time: 0.563
Imputing row 301/2101 with 0 missing, elapsed time: 0.565
Imputing row 401/2101 with 0 missing, elapsed time: 0.569
Imputing row 501/2101 with 2 missing, elapsed time: 0.571
Imputing row 601/2101 with 2 missing, elapsed time: 0.574
Imputing row 701/2101 with 2 missing, elapsed time: 0.577
Imputing row 801/2101 with 0 missing, elapsed time: 0.579
Imputing row 901/2101 with 2 missing, elapsed time: 0.582
Imputing row 1001/2101 with 2 missing, elapsed time: 0.585
Imputing row 1101/2101 with 2 missing, elapsed time: 0.588
Imputing row 1201/2101 with 0 missing, elapsed time: 0.590
Imputing row 1301/2101 with 2 missing, elapsed time: 0.594
Imputing row 1401/2101 with 0 missing, elapsed time: 0.596
Imputing row 1501/2101 with 0 missing, elapsed time: 0.600
Imputing row 1601/2101 with 2 missing, elapsed time: 0.602
Imputing 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aiijc.loc[:, ['sex', 'age', 'height', 'weight', 'nurse', 'site']] = KNN_imputer.fit_transform(aiijc[['sex', 'age', 'height', 'weight', 'nurse', 'site']])


In [360]:
aiijc.to_csv('../aiijc_data/aiijc_formatted/meta.csv', index=False)

In [423]:
aiijc_labels.to_csv('../aiijc_data/aiijc_formatted/labels.csv', index=False)

# Создание папки увеличенного датасета

In [387]:
filenames_hr = list(new_data.filename_hr)

In [406]:
from tqdm import tqdm
for i in tqdm(filenames_hr):
    hr = wfdb.rdsamp('../aiijc_data/ptb-xl-a-large-publicly-available-electrocardiography-dataset-1.0.3/'+i)[0].T
    np.save('../aiijc_data/big_formatted/signals/'+i[-8:], hr)

100%|█████████████████████████████████████████████████| 12931/12931 [04:37<00:00, 46.67it/s]


In [2]:
hr1 = np.load('../aiijc_data/task_1/train/00749_hr.npy')
hr2 = np.load('../aiijc_data/big_formatted/signals/00749_hr.npy')

In [415]:
new_data = new_data.drop(['ecg_id', 'filename_hr'], axis=1)

In [421]:
new_data.to_csv('../aiijc_data/big_formatted/meta.csv', index=False)

In [422]:
new_data_labels.to_csv('../aiijc_data/big_formatted/labels.csv', index=False)