# FEATURE ENGINEERING

In [1]:
#Import Essential Libraries
import pandas as pd
import numpy as np

In [2]:
# Load the cleaned patient information and the ECG data from CSV.
# For both files the first CSV column is used as the row index.
patient_df = pd.read_csv('clean_patient_info.csv', index_col=0)
ecg_df = pd.read_csv('ecg_data.csv', index_col=0)

In [3]:
# Preview the first five rows of the patient table.
patient_df.head(n=5)

Unnamed: 0,diagnosi,ecg_id,ritmi,patient_id,age,sex,height,weight,recording_date,heart_axis,second_opinion,initial_autogenerated_report,validated_by_human,electrodes_problems,Age_Group,Height_Group,Weight_Group
0,STACH,10900,VA,15654.0,54.0,0,166.796356,69.841845,1993-09-01 11:31:17,MID,False,True,False,no,"(0, 55]","(166, 167]","(60, 70]"
1,AFLT,10900,AF,15654.0,54.0,0,166.796356,69.841845,1993-09-01 11:31:17,MID,False,True,False,no,"(0, 55]","(166, 167]","(60, 70]"
2,SR,8209,SR,12281.0,55.0,0,166.796356,69.841845,1992-06-09 15:52:36,LAD,False,False,True,no,"(0, 55]","(166, 167]","(60, 70]"
3,STACH,17620,VA,2007.0,29.0,1,164.0,56.0,1997-02-08 18:33:30,,False,False,True,no,"(0, 55]","(90, 166]","(50, 60]"
4,SBRAD,12967,VA,8685.0,57.0,0,166.796356,69.841845,1994-09-13 10:21:14,MID,False,True,False,no,"(55, 70]","(166, 167]","(60, 70]"


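Keep only the target labels (`ritmi`, `diagnosi`) and the features `age`, `Age_Group`, `initial_autogenerated_report`, `validated_by_human` and `electrodes_problems`; drop all other features.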

- Drop `ecg_id` and `patient_id` --> These are just identification numbers and should not impact the outcome.
- Drop `height` and `weight` --> These have a lot of missing data, and imputing the missing values with the mean/median may distort the results and slow down the model.
- Drop `recording_date` --> It should not impact the results.
- Drop `heart_axis` --> EDA shows no impact on the results.
- Drop `second_opinion` --> Only 108 data points have a second opinion.

`initial_autogenerated_report` and `validated_by_human` should not impact the condition of the patient but may help in confirming the results. Therefore, these two features will not be used for training but will be used to separate the dataset into groups before training.

In [4]:
# Keep the two label columns plus the patient features used downstream;
# the column order below is the order of the resulting frame.
df = patient_df[
    [
        'ritmi',
        'diagnosi',
        'age',
        'Age_Group',
        'initial_autogenerated_report',
        'validated_by_human',
        'electrodes_problems',
    ]
]


In [5]:
# Preview the first five rows of the reduced patient frame.
df.head(n=5)

Unnamed: 0,ritmi,diagnosi,age,Age_Group,initial_autogenerated_report,validated_by_human,electrodes_problems
0,VA,STACH,54.0,"(0, 55]",True,False,no
1,AF,AFLT,54.0,"(0, 55]",True,False,no
2,SR,SR,55.0,"(0, 55]",False,True,no
3,VA,STACH,29.0,"(0, 55]",False,True,no
4,VA,SBRAD,57.0,"(55, 70]",True,False,no


In [6]:
# Move each frame's row index into a regular 'index' column so the two
# frames can later be merged on it.
df = df.reset_index()
ecg_df = ecg_df.reset_index()

In [7]:
# Report both shapes with a single print call, one message per line.
print(
    f'The shape of the patient features is: {df.shape}',
    f'The shape of the ecg features is: {ecg_df.shape}',
    sep='\n',
)

The shape of the patient features is: (6428, 8)
The shape of the ecg features is: (6428, 9601)


Both the patient and ECG dataframes have the same number of rows. We will merge the two to make one dataframe for the model.

In [8]:
# Join the patient columns with the ECG columns on the shared 'index'
# column (merge's default join type, inner, is used).
df = df.merge(ecg_df, on='index')
df.head()

Unnamed: 0,index,ritmi,diagnosi,age,Age_Group,initial_autogenerated_report,validated_by_human,electrodes_problems,0,1,...,9590,9591,9592,9593,9594,9595,9596,9597,9598,9599
0,0,VA,STACH,54.0,"(0, 55]",True,False,no,-0.005,-0.005,...,-0.095,-0.092,-0.086,-0.102,-0.113,-0.1,-0.128,-0.142,-0.139,-0.141
1,1,AF,AFLT,54.0,"(0, 55]",True,False,no,-0.005,-0.005,...,-0.095,-0.092,-0.086,-0.102,-0.113,-0.1,-0.128,-0.142,-0.139,-0.141
2,2,SR,SR,55.0,"(0, 55]",False,True,no,-0.17,-0.17,...,0.025,0.029,0.03,0.03,0.03,0.03,0.026,0.024,0.026,0.024
3,3,VA,STACH,29.0,"(0, 55]",False,True,no,0.0,0.0,...,-0.105,-0.089,-0.116,-0.142,-0.139,-0.14,-0.144,-0.145,-0.147,-0.15
4,4,VA,SBRAD,57.0,"(55, 70]",True,False,no,0.015,0.015,...,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13


In [9]:
#drop rows with electrod problems and drop electrode problems column
df = df[df.electrodes_problems == 'no']
df.drop(columns= ['index', 'electrodes_problems'], inplace= True)

In [10]:
# Display the column labels of the merged frame after the electrode filter.
df.columns

Index(['ritmi', 'diagnosi', 'age', 'Age_Group', 'initial_autogenerated_report',
       'validated_by_human', '0', '1', '2', '3',
       ...
       '9590', '9591', '9592', '9593', '9594', '9595', '9596', '9597', '9598',
       '9599'],
      dtype='object', length=9606)

In [11]:
# Separate the two label columns from everything else in the merged frame.
label_df = df.loc[:, ['ritmi', 'diagnosi']]
feature_df = df.drop(['ritmi', 'diagnosi'], axis='columns')

In [12]:
# Preview the first five rows of the label frame.
label_df.head(n=5)

Unnamed: 0,ritmi,diagnosi
0,VA,STACH
1,AF,AFLT
2,SR,SR
3,VA,STACH
4,VA,SBRAD


In [13]:
# Preview the first five rows of the feature frame.
feature_df.head(n=5)

Unnamed: 0,age,Age_Group,initial_autogenerated_report,validated_by_human,0,1,2,3,4,5,...,9590,9591,9592,9593,9594,9595,9596,9597,9598,9599
0,54.0,"(0, 55]",True,False,-0.005,-0.005,-0.005,-0.005,-0.005,-0.005,...,-0.095,-0.092,-0.086,-0.102,-0.113,-0.1,-0.128,-0.142,-0.139,-0.141
1,54.0,"(0, 55]",True,False,-0.005,-0.005,-0.005,-0.005,-0.005,-0.005,...,-0.095,-0.092,-0.086,-0.102,-0.113,-0.1,-0.128,-0.142,-0.139,-0.141
2,55.0,"(0, 55]",False,True,-0.17,-0.17,-0.17,-0.17,-0.17,-0.17,...,0.025,0.029,0.03,0.03,0.03,0.03,0.026,0.024,0.026,0.024
3,29.0,"(0, 55]",False,True,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.105,-0.089,-0.116,-0.142,-0.139,-0.14,-0.144,-0.145,-0.147,-0.15
4,57.0,"(55, 70]",True,False,0.015,0.015,0.015,0.015,0.015,0.015,...,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13,-0.13


In [14]:
# Partition the feature rows by the validated_by_human flag: rows equal to
# True go to the first frame, rows equal to False go to the second.
validated_by_human_df = feature_df.loc[feature_df['validated_by_human'].eq(True)]
Not_validated_by_human_df = feature_df.loc[feature_df['validated_by_human'].eq(False)]