In [1]:
import numpy as np
import pandas as pd
import gzip

In [2]:
# Mount Google Drive (Colab-only) so the /content/drive/... paths read and
# written by the cells below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Demographic Data

In [3]:
# Load the demographic / annotation table: semicolon-separated, first row is the
# header, first column is used as the row index.
# NOTE(review): the preview below shows ecg_id 10900 on two rows with different
# `ritmi` values (VA and AF) — confirm that duplicated recordings with
# different labels are intended for a single-label target.
df = pd.read_csv(
    '/content/drive/Shareddrives/OG_Mood_Lyric_Processors_CS272/CS284A/archive/coorteeqsrafva.csv',
    sep=';',
    header=0,
    index_col=0,
)

# Report (rows, columns), then preview the first five rows.
print(df.shape)
df.head()

(6428, 30)


Unnamed: 0,diagnosi,ecg_id,ritmi,patient_id,age,sex,height,weight,nurse,site,...,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,strat_fold,filename_lr,filename_hr
0,STACH,10900,VA,15654.0,54.0,0,,,0.0,0.0,...,False,,,,,,,6,records100/10000/10900_lr,records500/10000/10900_hr
1,AFLT,10900,AF,15654.0,54.0,0,,,0.0,0.0,...,False,,,,,,,6,records100/10000/10900_lr,records500/10000/10900_hr
2,SR,8209,SR,12281.0,55.0,0,,,1.0,2.0,...,True,,,,,,,10,records100/08000/08209_lr,records500/08000/08209_hr
3,STACH,17620,VA,2007.0,29.0,1,164.0,56.0,7.0,1.0,...,True,,,,,,,1,records100/17000/17620_lr,records500/17000/17620_hr
4,SBRAD,12967,VA,8685.0,57.0,0,,,0.0,0.0,...,False,,", I-AVR,",,,,,1,records100/12000/12967_lr,records500/12000/12967_hr


In [4]:
# Build the working feature table by removing the columns that are not carried
# forward; `df` itself is left untouched.
sub_df = df.drop(
    labels=[
        'diagnosi', 'patient_id', 'ecg_id', 'recording_date', 'report',
        'scp_codes', 'infarction_stadium1', 'infarction_stadium2',
        'initial_autogenerated_report', 'baseline_drift', 'static_noise',
        'burst_noise', 'electrodes_problems', 'extra_beats',
        'filename_lr', 'filename_hr',
    ],
    axis='columns',
)
sub_df.head()

Unnamed: 0,ritmi,age,sex,height,weight,nurse,site,device,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold
0,VA,54.0,0,,,0.0,0.0,CS100 3,MID,,False,False,,6
1,AF,54.0,0,,,0.0,0.0,CS100 3,MID,,False,False,,6
2,SR,55.0,0,,,1.0,2.0,CS-12,LAD,1.0,False,True,,10
3,VA,29.0,1,164.0,56.0,7.0,1.0,AT-6 C 5.6,,0.0,False,True,,1
4,VA,57.0,0,,,0.0,0.0,CS100 3,MID,,False,False,,1


In [5]:
# Impute missing age / height / weight with the mean of the respective column.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on a column selection: that form mutates an intermediate
# object, is deprecated in pandas 2.x and leaves sub_df unchanged when
# Copy-on-Write is enabled.
for column in ['age', 'height', 'weight']:
    sub_df[column] = sub_df[column].fillna(sub_df[column].mean())

# Missing nurse / site / validated_by entries become 0.
for column in ['nurse', 'site', 'validated_by']:
    sub_df[column] = sub_df[column].fillna(0)

# Missing heart_axis / pacemaker entries get the explicit category 'Missing'
# (not 0).
for column in ['heart_axis', 'pacemaker']:
    sub_df[column] = sub_df[column].fillna('Missing')

In [6]:
# Encode the rhythm label as an integer class: SR -> 0, AF -> 1, VA -> 2.
ritmi_codes = {'SR': 0, 'AF': 1, 'VA': 2}
ritmi_encoded = sub_df['ritmi'].map(ritmi_codes)

# Series.map turns every value that is not a key of the dict into NaN. That
# happens for unexpected labels and also when this cell is run a second time
# (the column then already holds 0/1/2). The blanket fillna(0) applied to the
# merged frame later would silently turn those NaN into class 0, so fail loudly
# here instead.
unmapped = sub_df.loc[ritmi_encoded.isna(), 'ritmi'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Cannot encode 'ritmi': unexpected values {unmapped.tolist()} "
        "(was this cell already executed?)"
    )

sub_df['ritmi'] = ritmi_encoded.astype('int64').values
print(sub_df.shape)
sub_df.head()

(6428, 14)


Unnamed: 0,ritmi,age,sex,height,weight,nurse,site,device,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold
0,2,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6
1,1,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6
2,0,55.0,0,166.796356,69.841845,1.0,2.0,CS-12,LAD,1.0,False,True,Missing,10
3,2,29.0,1,164.0,56.0,7.0,1.0,AT-6 C 5.6,Missing,0.0,False,True,Missing,1
4,2,57.0,0,166.796356,69.841845,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,1


## ECG data

In [7]:
# Load the ECG signal array from the shared drive and show its shape and a
# truncated preview of its contents.
ecgeq_arr = np.load(
    '/content/drive/Shareddrives/OG_Mood_Lyric_Processors_CS272/CS284A/archive/ecgeq-500hzsrfava.npy'
)
print(ecgeq_arr.shape)
ecgeq_arr

(6428, 5000, 12)


array([[[-0.005,  0.135,  0.14 , ..., -0.21 , -0.145, -0.08 ],
        [-0.005,  0.135,  0.14 , ..., -0.21 , -0.145, -0.08 ],
        [-0.005,  0.135,  0.14 , ..., -0.21 , -0.145, -0.08 ],
        ...,
        [ 0.03 , -0.045, -0.075, ..., -0.02 , -0.035, -0.045],
        [ 0.03 , -0.045, -0.075, ..., -0.02 , -0.035, -0.045],
        [ 0.03 , -0.045, -0.075, ..., -0.02 , -0.035, -0.045]],

       [[-0.005,  0.135,  0.14 , ..., -0.21 , -0.145, -0.08 ],
        [-0.005,  0.135,  0.14 , ..., -0.21 , -0.145, -0.08 ],
        [-0.005,  0.135,  0.14 , ..., -0.21 , -0.145, -0.08 ],
        ...,
        [ 0.03 , -0.045, -0.075, ..., -0.02 , -0.035, -0.045],
        [ 0.03 , -0.045, -0.075, ..., -0.02 , -0.035, -0.045],
        [ 0.03 , -0.045, -0.075, ..., -0.02 , -0.035, -0.045]],

       [[-0.17 , -0.13 ,  0.04 , ..., -0.14 , -0.05 , -0.03 ],
        [-0.17 , -0.13 ,  0.04 , ..., -0.14 , -0.05 , -0.03 ],
        [-0.17 , -0.13 ,  0.04 , ..., -0.14 , -0.05 , -0.03 ],
        ...,
        [ 0.

In [8]:
# Keep only the first 700 entries along the second axis; the first and last
# axes are left complete.
ecgeq_arr = ecgeq_arr[:, :700]

In [9]:
# Unpack the three array dimensions and print them.
m, n, r = np.shape(ecgeq_arr)
print(m, n, r)

6428 700 12


In [10]:
m, n, r = ecgeq_arr.shape
# Long format: one row per (first-axis entry, second-axis entry). Column 0 is
# the first-axis position repeated n times; the remaining r columns are the
# values along the last axis.
out_arr = np.hstack(
    (
        np.repeat(np.arange(m), n).reshape(-1, 1),
        ecgeq_arr.reshape(m * n, r),
    )
)
out_df = pd.DataFrame(out_arr)

In [11]:
# Inspect the long-format table: column 0 is the recording number, columns
# 1-12 hold the signal values.
out_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,-0.145,-0.080
1,0.0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,-0.145,-0.080
2,0.0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,-0.145,-0.080
3,0.0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,-0.145,-0.080
4,0.0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.211,-0.146,-0.080
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4499595,6427.0,0.010,0.170,0.16,-0.090,-0.075,0.165,0.155,0.365,0.230,0.030,-0.065,-0.060
4499596,6427.0,0.014,0.174,0.16,-0.094,-0.073,0.167,0.155,0.368,0.245,0.029,-0.057,-0.056
4499597,6427.0,0.016,0.176,0.16,-0.096,-0.073,0.167,0.155,0.383,0.261,0.040,-0.052,-0.055
4499598,6427.0,0.014,0.174,0.16,-0.094,-0.073,0.167,0.155,0.406,0.282,0.059,-0.046,-0.053


In [12]:
# Name the columns: the recording number ('index') followed by the 12 signal
# columns, and store the recording number as int32.
# NOTE(review): in the rows displayed for this frame the 4th-6th signal columns
# satisfy col4 = -(I + II) / 2, col5 = (I - III) / 2 and col6 = (II + III) / 2,
# i.e. the data order looks like aVR, aVL, aVF, whereas the names assigned here
# are aVF, aVR, aVL. Later cells select these columns by name in this same
# order, so the layout of the saved tensor does not depend on the names, but
# anything that reads a lead by name would get the wrong one. Confirm, and if
# so fix the names here and in the later column lists together.
out_df.columns= ['index', 'I', 'II', 'III', 'aVF', 'aVR', 'aVL', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
out_df['index'] = out_df['index'].astype('int32')
# NOTE(review): left disabled — the first displayed rows are identical, so
# dropping duplicates would remove samples from a recording.
#out_df = out_df.drop_duplicates()
out_df

Unnamed: 0,index,I,II,III,aVF,aVR,aVL,V1,V2,V3,V4,V5,V6
0,0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,-0.145,-0.080
1,0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,-0.145,-0.080
2,0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,-0.145,-0.080
3,0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,-0.145,-0.080
4,0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.211,-0.146,-0.080
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4499595,6427,0.010,0.170,0.16,-0.090,-0.075,0.165,0.155,0.365,0.230,0.030,-0.065,-0.060
4499596,6427,0.014,0.174,0.16,-0.094,-0.073,0.167,0.155,0.368,0.245,0.029,-0.057,-0.056
4499597,6427,0.016,0.176,0.16,-0.096,-0.073,0.167,0.155,0.383,0.261,0.040,-0.052,-0.055
4499598,6427,0.014,0.174,0.16,-0.094,-0.073,0.167,0.155,0.406,0.282,0.059,-0.046,-0.053


In [13]:
# Give every row of the feature table its 0-based position as `unique_id`;
# this is the key used to join against the recording number in out_df.
sub_df['unique_id'] = np.arange(len(sub_df))
sub_df

Unnamed: 0,ritmi,age,sex,height,weight,nurse,site,device,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold,unique_id
0,2,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0
1,1,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,1
2,0,55.0,0,166.796356,69.841845,1.0,2.0,CS-12,LAD,1.0,False,True,Missing,10,2
3,2,29.0,1,164.000000,56.000000,7.0,1.0,AT-6 C 5.6,Missing,0.0,False,True,Missing,1,3
4,2,57.0,0,166.796356,69.841845,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6423,2,81.0,0,178.000000,70.000000,11.0,1.0,AT-6 6,LAD,0.0,False,True,Missing,4,6423
6424,2,88.0,0,152.000000,45.000000,11.0,1.0,AT-6 6,Missing,0.0,False,True,Missing,10,6424
6425,0,83.0,1,166.796356,69.841845,1.0,2.0,CS-12,LAD,1.0,False,True,Missing,5,6425
6426,1,75.0,1,177.000000,80.000000,0.0,34.0,AT-6 C 5.5,Missing,2.0,False,True,Missing,7,6426


## Merged ECG + Demographic Data

In [14]:
# Attach the per-recording features to every signal row (left join on the
# recording number), then drop the now-redundant 'index' key; 'unique_id'
# stays in the frame.
merged_df = (
    out_df
    .merge(sub_df, how='left', left_on='index', right_on='unique_id')
    .drop(columns=['index'])
)
merged_df

Unnamed: 0,I,II,III,aVF,aVR,aVL,V1,V2,V3,V4,...,nurse,site,device,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold,unique_id
0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,...,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0
1,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,...,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0
2,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,...,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0
3,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,...,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0
4,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.211,...,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4499595,0.010,0.170,0.16,-0.090,-0.075,0.165,0.155,0.365,0.230,0.030,...,1.0,2.0,CS-12,MID,1.0,False,True,Missing,8,6427
4499596,0.014,0.174,0.16,-0.094,-0.073,0.167,0.155,0.368,0.245,0.029,...,1.0,2.0,CS-12,MID,1.0,False,True,Missing,8,6427
4499597,0.016,0.176,0.16,-0.096,-0.073,0.167,0.155,0.383,0.261,0.040,...,1.0,2.0,CS-12,MID,1.0,False,True,Missing,8,6427
4499598,0.014,0.174,0.16,-0.094,-0.073,0.167,0.155,0.406,0.282,0.059,...,1.0,2.0,CS-12,MID,1.0,False,True,Missing,8,6427


In [15]:
# Alternative that was tried and disabled: drop every row containing a NaN.
#new_mdf = merged_df.dropna()
# Replace every remaining NaN in the merged frame with 0, in all columns.
# NOTE(review): this also applies to non-numeric columns (e.g. 'device', which
# is not imputed earlier) and to the label column — confirm that 0 is an
# acceptable fill value there.
merged_df = merged_df.fillna(0)
# Renumber the rows 0..len-1 and list the resulting column names.
new_mdf = merged_df.reset_index(drop=True)
new_mdf.columns

Index(['I', 'II', 'III', 'aVF', 'aVR', 'aVL', 'V1', 'V2', 'V3', 'V4', 'V5',
       'V6', 'ritmi', 'age', 'sex', 'height', 'weight', 'nurse', 'site',
       'device', 'heart_axis', 'validated_by', 'second_opinion',
       'validated_by_human', 'pacemaker', 'strat_fold', 'unique_id'],
      dtype='object')

In [16]:
# Preview the first five rows of the merged, NaN-filled frame.
new_mdf.head()

Unnamed: 0,I,II,III,aVF,aVR,aVL,V1,V2,V3,V4,...,nurse,site,device,heart_axis,validated_by,second_opinion,validated_by_human,pacemaker,strat_fold,unique_id
0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.09,-0.11,-0.21,...,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0
1,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.09,-0.11,-0.21,...,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0
2,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.09,-0.11,-0.21,...,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0
3,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.09,-0.11,-0.21,...,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0
4,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.09,-0.11,-0.211,...,0.0,0.0,CS100 3,MID,0.0,False,False,Missing,6,0


In [17]:
# Sanity check: count the distinct recording ids that survived the merge.
unique_values = pd.unique(new_mdf['unique_id'])
unique_values.shape

(6428,)

In [18]:
# Remove the columns that are not used by any of the following cells.
temp_df = new_mdf.drop(
    labels=[
        'heart_axis',
        'validated_by',
        'second_opinion',
        'validated_by_human',
        'pacemaker',
        'strat_fold',
    ],
    axis='columns',
)

In [19]:
# Inspect the reduced frame (signal columns, label, remaining features and
# unique_id).
temp_df

Unnamed: 0,I,II,III,aVF,aVR,aVL,V1,V2,V3,V4,...,V6,ritmi,age,sex,height,weight,nurse,site,device,unique_id
0,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,...,-0.080,2,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3,0
1,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,...,-0.080,2,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3,0
2,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,...,-0.080,2,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3,0
3,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.210,...,-0.080,2,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3,0
4,-0.005,0.135,0.14,-0.065,-0.073,0.137,-0.125,-0.090,-0.110,-0.211,...,-0.080,2,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4499595,0.010,0.170,0.16,-0.090,-0.075,0.165,0.155,0.365,0.230,0.030,...,-0.060,0,27.0,0,166.796356,69.841845,1.0,2.0,CS-12,6427
4499596,0.014,0.174,0.16,-0.094,-0.073,0.167,0.155,0.368,0.245,0.029,...,-0.056,0,27.0,0,166.796356,69.841845,1.0,2.0,CS-12,6427
4499597,0.016,0.176,0.16,-0.096,-0.073,0.167,0.155,0.383,0.261,0.040,...,-0.055,0,27.0,0,166.796356,69.841845,1.0,2.0,CS-12,6427
4499598,0.014,0.174,0.16,-0.094,-0.073,0.167,0.155,0.406,0.282,0.059,...,-0.053,0,27.0,0,166.796356,69.841845,1.0,2.0,CS-12,6427


In [20]:
def _series_to_list(series):
    """Return the values of one signal column of one recording as a Python list."""
    return series.tolist()


# Collapse the long table back to one row per recording: every signal column
# becomes a list holding that recording's values in row order.
result = (
    temp_df
    .groupby('unique_id')[
        ['I', 'II', 'III', 'aVF', 'aVR', 'aVL', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
    ]
    .agg(_series_to_list)
)

# The same table as a NumPy array (one list per cell).
result_array = result.values


In [21]:
# Flatten the twelve per-column lists of each recording into a single list,
# one column after the other in the order given by lead_columns.
# The signal columns are selected explicitly so that re-running this cell does
# not fold an already-present 'concatenated_values' column into the new value,
# and the nested comprehension replaces sum(row, []), which re-copies the
# growing list every time another column is appended.
lead_columns = ['I', 'II', 'III', 'aVF', 'aVR', 'aVL', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
result['concatenated_values'] = result[lead_columns].apply(
    lambda row: [value for lead_values in row for value in lead_values],
    axis=1,
)

In [22]:
# Inspect the per-recording table: one list per signal column plus the
# flattened 'concatenated_values' list.
result

Unnamed: 0_level_0,I,II,III,aVF,aVR,aVL,V1,V2,V3,V4,V5,V6,concatenated_values
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,"[-0.005, -0.005, -0.005, -0.005, -0.005, -0.00...","[0.135, 0.135, 0.135, 0.135, 0.135, 0.135, 0.1...","[0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.136, 0....","[-0.065, -0.065, -0.065, -0.065, -0.065, -0.06...","[-0.073, -0.073, -0.073, -0.073, -0.073, -0.07...","[0.137, 0.137, 0.137, 0.137, 0.137, 0.137, 0.1...","[-0.125, -0.125, -0.125, -0.125, -0.125, -0.12...","[-0.09, -0.09, -0.09, -0.09, -0.09, -0.09, -0....","[-0.11, -0.11, -0.11, -0.11, -0.11, -0.11, -0....","[-0.21, -0.21, -0.21, -0.21, -0.211, -0.21, -0...","[-0.145, -0.145, -0.145, -0.145, -0.146, -0.14...","[-0.08, -0.08, -0.08, -0.08, -0.08, -0.08, -0....","[-0.005, -0.005, -0.005, -0.005, -0.005, -0.00..."
1,"[-0.005, -0.005, -0.005, -0.005, -0.005, -0.00...","[0.135, 0.135, 0.135, 0.135, 0.135, 0.135, 0.1...","[0.14, 0.14, 0.14, 0.14, 0.14, 0.14, 0.136, 0....","[-0.065, -0.065, -0.065, -0.065, -0.065, -0.06...","[-0.073, -0.073, -0.073, -0.073, -0.073, -0.07...","[0.137, 0.137, 0.137, 0.137, 0.137, 0.137, 0.1...","[-0.125, -0.125, -0.125, -0.125, -0.125, -0.12...","[-0.09, -0.09, -0.09, -0.09, -0.09, -0.09, -0....","[-0.11, -0.11, -0.11, -0.11, -0.11, -0.11, -0....","[-0.21, -0.21, -0.21, -0.21, -0.211, -0.21, -0...","[-0.145, -0.145, -0.145, -0.145, -0.146, -0.14...","[-0.08, -0.08, -0.08, -0.08, -0.08, -0.08, -0....","[-0.005, -0.005, -0.005, -0.005, -0.005, -0.00..."
2,"[-0.17, -0.17, -0.17, -0.17, -0.17, -0.17, -0....","[-0.13, -0.13, -0.13, -0.13, -0.13, -0.13, -0....","[0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.0...","[0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.1...","[-0.105, -0.105, -0.105, -0.105, -0.105, -0.10...","[-0.045, -0.045, -0.045, -0.045, -0.045, -0.04...","[-0.045, -0.045, -0.045, -0.045, -0.045, -0.04...","[-0.06, -0.06, -0.06, -0.06, -0.06, -0.06, -0....","[0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.0...","[-0.14, -0.14, -0.14, -0.14, -0.14, -0.14, -0....","[-0.05, -0.05, -0.05, -0.05, -0.05, -0.05, -0....","[-0.03, -0.03, -0.03, -0.03, -0.03, -0.03, -0....","[-0.17, -0.17, -0.17, -0.17, -0.17, -0.17, -0...."
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008, 0.013, 0...","[-0.08, -0.08, -0.08, -0.08, -0.08, -0.08, -0....","[-0.08, -0.08, -0.08, -0.08, -0.08, -0.08, -0....","[0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.032, 0....","[0.04, 0.04, 0.04, 0.04, 0.04, 0.04, 0.044, 0....","[-0.08, -0.08, -0.08, -0.08, -0.08, -0.08, -0....","[0.075, 0.075, 0.075, 0.075, 0.075, 0.075, 0.0...","[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.014, 0....","[-0.015, -0.015, -0.015, -0.015, -0.015, -0.01...","[-0.085, -0.085, -0.085, -0.085, -0.085, -0.08...","[-0.195, -0.195, -0.195, -0.195, -0.195, -0.19...","[-0.165, -0.165, -0.165, -0.165, -0.165, -0.16...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008, 0.013, 0..."
4,"[0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.0...","[0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.0...","[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.006, 0....","[-0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0....","[0.002, 0.002, 0.002, 0.002, 0.002, 0.002, 0.0...","[0.017, 0.017, 0.017, 0.017, 0.017, 0.017, 0.0...","[0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.024, 0....","[0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.003, -0...","[-0.025, -0.025, -0.025, -0.025, -0.025, -0.02...","[-0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0....","[-0.04, -0.04, -0.04, -0.04, -0.04, -0.04, -0....","[0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6423,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.001, 0....","[-0.06, -0.06, -0.06, -0.06, -0.06, -0.06, -0....","[-0.06, -0.06, -0.06, -0.06, -0.06, -0.06, -0....","[0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.0...","[0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.03, 0.0...","[-0.06, -0.06, -0.06, -0.06, -0.06, -0.06, -0....","[0.085, 0.085, 0.085, 0.085, 0.085, 0.085, 0.0...","[-0.06, -0.06, -0.06, -0.06, -0.06, -0.06, -0....","[-0.045, -0.045, -0.045, -0.045, -0.045, -0.04...","[-0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0....","[0.28, 0.28, 0.28, 0.28, 0.28, 0.28, 0.28, 0.2...","[-0.06, -0.06, -0.06, -0.06, -0.06, -0.06, -0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.001, 0...."
6424,"[-0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0....","[0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.042, 0....","[0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.0...","[-0.015, -0.015, -0.015, -0.015, -0.015, -0.01...","[-0.045, -0.045, -0.045, -0.045, -0.045, -0.04...","[0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.056, 0....","[-0.285, -0.285, -0.285, -0.285, -0.285, -0.28...","[-2.21, -2.21, -2.21, -2.21, -2.21, -2.21, -2....","[-2.35, -2.35, -2.35, -2.35, -2.35, -2.35, -2....","[0.07, 0.07, 0.07, 0.07, 0.07, 0.07, 0.066, 0....","[0.135, 0.135, 0.135, 0.135, 0.135, 0.135, 0.1...","[0.145, 0.145, 0.145, 0.145, 0.145, 0.145, 0.1...","[-0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0...."
6425,"[0.095, 0.095, 0.095, 0.094, 0.097, 0.095, 0.0...","[0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.0...","[-0.07, -0.07, -0.07, -0.069, -0.072, -0.07, -...","[-0.06, -0.06, -0.06, -0.059, -0.061, -0.06, -...","[0.082, 0.082, 0.082, 0.081, 0.084, 0.082, 0.0...","[-0.022, -0.022, -0.022, -0.022, -0.023, -0.02...","[-0.065, -0.065, -0.065, -0.065, -0.065, -0.06...","[0.025, 0.025, 0.025, 0.025, 0.025, 0.025, 0.0...","[0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.0...","[0.135, 0.135, 0.135, 0.135, 0.135, 0.135, 0.1...","[0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.042, 0....","[0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.056, 0....","[0.095, 0.095, 0.095, 0.094, 0.097, 0.095, 0.0..."
6426,"[-0.145, -0.145, -0.145, -0.145, -0.145, -0.14...","[0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.1...","[0.295, 0.295, 0.295, 0.295, 0.295, 0.295, 0.2...","[-0.003, -0.003, -0.003, -0.003, -0.003, -0.00...","[-0.22, -0.22, -0.22, -0.22, -0.22, -0.22, -0....","[0.222, 0.222, 0.222, 0.222, 0.222, 0.222, 0.2...","[0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.1...","[0.485, 0.485, 0.485, 0.485, 0.485, 0.485, 0.4...","[0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.397, 0.3...","[0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.296, 0.295, 0...","[0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.06, 0.0...","[0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.024, 0....","[-0.145, -0.145, -0.145, -0.145, -0.145, -0.14..."


In [23]:
# One label per recording: take the first `ritmi` value found in each
# unique_id group and return it as a two-column frame (unique_id, ritmi).
label_df = (
    temp_df[['ritmi', 'unique_id']]
    .groupby('unique_id')['ritmi']
    .first()
    .reset_index(name='ritmi')
)

In [24]:
# Inspect the per-recording label table.
label_df

Unnamed: 0,unique_id,ritmi
0,0,2
1,1,1
2,2,0
3,3,2
4,4,2
...,...,...
6423,6423,2
6424,6424,2
6425,6425,0
6426,6426,1


In [25]:
# Per-row (not yet per-recording) view of the non-signal features.
# NOTE(review): this frame is not referenced by any later cell shown here.
other_features_df = temp_df.loc[
    :,
    ['age', 'sex', 'height', 'weight', 'nurse', 'site', 'device'],
]

In [26]:
# Reduce the non-signal features to one row per recording by taking the first
# value of each column within every unique_id group.
other_features_df_modified = (
    temp_df[['age', 'sex', 'height', 'weight', 'nurse', 'site', 'device', 'unique_id']]
    .groupby('unique_id')
    .first()
    .reset_index()
)

In [27]:
# The grouping key is no longer needed once there is one row per recording.
other_features_df_modified = other_features_df_modified.drop('unique_id', axis=1)

In [28]:
# Inspect the per-recording feature table before encoding.
other_features_df_modified

Unnamed: 0,age,sex,height,weight,nurse,site,device
0,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3
1,54.0,0,166.796356,69.841845,0.0,0.0,CS100 3
2,55.0,0,166.796356,69.841845,1.0,2.0,CS-12
3,29.0,1,164.000000,56.000000,7.0,1.0,AT-6 C 5.6
4,57.0,0,166.796356,69.841845,0.0,0.0,CS100 3
...,...,...,...,...,...,...,...
6423,81.0,0,178.000000,70.000000,11.0,1.0,AT-6 6
6424,88.0,0,152.000000,45.000000,11.0,1.0,AT-6 6
6425,83.0,1,166.796356,69.841845,1.0,2.0,CS-12
6426,75.0,1,177.000000,80.000000,0.0,34.0,AT-6 C 5.5


In [29]:
# One-hot encode the nurse, site and device columns, dropping the first level
# of each to avoid a redundant indicator.
# dtype is pinned to uint8 (0/1), matching the values displayed for this frame.
# pandas >= 2.0 defaults to bool indicators, which would make
# `df_encoded.values` an object array once mixed with the float columns and
# break the torch.tensor conversion further down.
df_encoded = pd.get_dummies(
    other_features_df_modified,
    columns=['nurse', 'site', 'device'],
    drop_first=True,
    dtype=np.uint8,
)

In [30]:
# Inspect the one-hot encoded feature table.
df_encoded

Unnamed: 0,age,sex,height,weight,nurse_1.0,nurse_2.0,nurse_3.0,nurse_4.0,nurse_5.0,nurse_6.0,...,device_AT-6 C,device_AT-6 C 5.0,device_AT-6 C 5.3,device_AT-6 C 5.5,device_AT-6 C 5.6,device_AT-6 C 5.8,device_AT-60 3,device_CS-12,device_CS-12 E,device_CS100 3
0,54.0,0,166.796356,69.841845,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,54.0,0,166.796356,69.841845,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,55.0,0,166.796356,69.841845,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,29.0,1,164.000000,56.000000,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,57.0,0,166.796356,69.841845,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6423,81.0,0,178.000000,70.000000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6424,88.0,0,152.000000,45.000000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6425,83.0,1,166.796356,69.841845,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
6426,75.0,1,177.000000,80.000000,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [31]:
from sklearn.preprocessing import StandardScaler

# Standardize the continuous columns; the scaler is fitted on all rows of
# df_encoded and the scaled values are written back in place of the originals.
scaler = StandardScaler()
columns_to_standardize = ['age', 'weight', 'height']

df_subset = df_encoded[columns_to_standardize]
df_standardized_subset = scaler.fit_transform(df_subset)

df_encoded[columns_to_standardize] = df_standardized_subset

In [32]:
# Inspect the encoded, standardized feature table as a NumPy array.
df_encoded.values

array([[-4.37517173e-01,  0.00000000e+00, -5.14808661e-15, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [-4.37517173e-01,  0.00000000e+00, -5.14808661e-15, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       [-3.80990909e-01,  0.00000000e+00, -5.14808661e-15, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       ...,
       [ 1.20174448e+00,  1.00000000e+00, -5.14808661e-15, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 7.49534370e-01,  1.00000000e+00,  1.84820846e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-1.96372630e+00,  0.00000000e+00, -5.14808661e-15, ...,
         1.00000000e+00,  0.00000000e+00,  0.00000000e+00]])

In [33]:
import torch

# Convert the feature table to a float64 tensor. The explicit dtype guarantees
# a numeric array even when the indicator columns are bool (pandas >= 2.0
# get_dummies default); a mixed bool/float frame would otherwise come out as an
# object array, which torch.tensor rejects. With 0/1 integer indicators the
# result is the same float64 tensor as before.
# Note: despite its name, of_np is a torch tensor, not a NumPy array.
of_np = torch.tensor(df_encoded.to_numpy(dtype=np.float64))

In [34]:
# Inspect the feature tensor built from df_encoded.
of_np

tensor([[-4.3752e-01,  0.0000e+00, -5.1481e-15,  ...,  0.0000e+00,
          0.0000e+00,  1.0000e+00],
        [-4.3752e-01,  0.0000e+00, -5.1481e-15,  ...,  0.0000e+00,
          0.0000e+00,  1.0000e+00],
        [-3.8099e-01,  0.0000e+00, -5.1481e-15,  ...,  1.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 1.2017e+00,  1.0000e+00, -5.1481e-15,  ...,  1.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 7.4953e-01,  1.0000e+00,  1.8482e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [-1.9637e+00,  0.0000e+00, -5.1481e-15,  ...,  1.0000e+00,
          0.0000e+00,  0.0000e+00]], dtype=torch.float64)

In [35]:
# Shape of the feature tensor: (rows of df_encoded, columns of df_encoded).
of_np.shape

torch.Size([6428, 70])

In [36]:
# Persist the feature tensor to the shared drive.
torch.save(of_np, '/content/drive/Shareddrives/OG_Mood_Lyric_Processors_CS272/CS284A/other_features_file.pt')

In [37]:
# Pair every recording's signal lists with its label; `unique_id` is the index
# level of `result` and a regular column of `label_df`.
final_df = result.merge(label_df, how='inner', on='unique_id')

In [38]:
# Keep only the flattened signal ('concatenated_values') and the label
# ('ritmi'); the key and the per-column lists are dropped.
final_df = final_df.drop(
    labels=[
        'unique_id',
        'I', 'II', 'III',
        'aVF', 'aVR', 'aVL',
        'V1', 'V2', 'V3', 'V4', 'V5', 'V6',
    ],
    axis='columns',
)

In [39]:
# Inspect the final table: one flattened signal list and one label per row.
final_df

Unnamed: 0,concatenated_values,ritmi
0,"[-0.005, -0.005, -0.005, -0.005, -0.005, -0.00...",2
1,"[-0.005, -0.005, -0.005, -0.005, -0.005, -0.00...",1
2,"[-0.17, -0.17, -0.17, -0.17, -0.17, -0.17, -0....",0
3,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.008, 0.013, 0...",2
4,"[0.015, 0.015, 0.015, 0.015, 0.015, 0.015, 0.0...",2
...,...,...
6423,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, -0.001, 0....",2
6424,"[-0.02, -0.02, -0.02, -0.02, -0.02, -0.02, -0....",2
6425,"[0.095, 0.095, 0.095, 0.094, 0.097, 0.095, 0.0...",0
6426,"[-0.145, -0.145, -0.145, -0.145, -0.145, -0.14...",1


In [40]:
# Number of rows in the flattened-signal column (each entry is itself a list,
# so the array is one-dimensional).
final_df['concatenated_values'].values.shape

(6428,)

In [41]:
# Length of the longest flattened signal across all rows.
max_length = max(len(sequence) for sequence in final_df['concatenated_values'])
print(max_length)

8400


In [42]:
import torch
from torch.nn.utils.rnn import pad_sequence

# One float32 tensor per row of final_df, built from its flattened signal list.
torch_values = [
    torch.tensor(sequence, dtype=torch.float32)
    for sequence in final_df['concatenated_values']
]

# Stack into a (rows, longest sequence) tensor, zero-padding shorter rows.
padded_sequences = pad_sequence(torch_values, batch_first=True, padding_value=0)

# Despite its name, input_size holds the full shape of padded_sequences.
input_size = padded_sequences.shape

In [43]:
# Shape of the padded signal tensor: (rows, padded sequence length).
padded_sequences.shape

torch.Size([6428, 8400])

In [44]:
# Persist the padded signal tensor to the shared drive.
torch.save(padded_sequences, '/content/drive/Shareddrives/OG_Mood_Lyric_Processors_CS272/CS284A/output_file.pt')

In [45]:
# input_size is the shape of padded_sequences captured when it was built.
input_size

torch.Size([6428, 8400])

In [46]:
# Label tensor, one entry per row of label_df in the same order.
labels = torch.tensor(label_df['ritmi'].to_numpy())

In [47]:
# Persist the label tensor to the shared drive.
torch.save(labels, '/content/drive/Shareddrives/OG_Mood_Lyric_Processors_CS272/CS284A/labels.pt')