In [2]:
import pandas as pd
import numpy as np
from utils import get_most_popular_diagnoses
from tqdm import tqdm
import pickle

In [3]:
# Load the training table; the first CSV column is used as the row index.
train_df = pd.read_csv(
    './data/train_data_complaints_repeats_doctors.csv',
    index_col=0,
)

In [4]:
# Diagnosis codes picked by the project helper (0.8 is presumably a popularity /
# coverage threshold -- TODO confirm in utils), numbered in the order returned.
diagnoses = get_most_popular_diagnoses(train_df, 0.8)
diags_to_id = dict((diag, idx) for idx, diag in enumerate(diagnoses))

In [5]:
# Map every diagnosis code to its integer id. Codes that are not in
# diags_to_id all share one extra bucket whose id is len(diags_to_id).
train_df['Id_диагноза'] = train_df['Код_диагноза'].apply(
    lambda code: diags_to_id.get(code, len(diags_to_id)))

In [6]:
# One diagnosis-frequency vector per patient: count how often each diagnosis
# id occurs among the patient's rows, then divide by the number of rows.
# The vector has len(diags_to_id) + 1 slots (the last one is the shared bucket
# for codes outside diags_to_id).
n_diag_slots = len(diags_to_id) + 1
gb = train_df.groupby('Id_Пациента')
onehot_dict = {
    patient_id: np.bincount(rows['Id_диагноза'], minlength=n_diag_slots) / len(rows)
    for patient_id, rows in tqdm(gb)
}

100%|██████████| 29989/29989 [00:03<00:00, 7925.76it/s]


In [41]:
# Stack the per-patient vectors into one matrix. Row 0 is an all-zero vector
# (presumably a stand-in for a patient with no history -- confirm); the
# remaining rows follow the patients in ascending id order.
patients_diagnose_list = [np.zeros(len(diags_to_id) + 1)]
patients_diagnose_list.extend(onehot_dict[key] for key in sorted(onehot_dict))
patients_diagnoses = np.array(patients_diagnose_list)

In [42]:
# Attach each patient's frequency vector to every row belonging to that patient.
train_df['Id_диагноза_onehot'] = train_df['Id_Пациента'].apply(onehot_dict.__getitem__)

In [43]:
# Row-aligned matrix: one frequency vector per row of train_df
# (patients with several rows therefore appear several times).
train_patient_arr = np.array(train_df['Id_диагноза_onehot'].tolist())

In [44]:
# Persist the diagnosis-code -> id mapping next to the data files.
with open('./data/diags_to_inds.pkl', 'wb') as mapping_file:
    pickle.dump(diags_to_id, mapping_file)

In [45]:
# Save the row-aligned matrix (one vector per train_df row).
# NOTE(review): the file name matches the `patients_diagnoses` variable, but the
# array written here is `train_patient_arr` -- confirm this is the intended one.
np.save('./data/patients_diagnoses.npy', arr=train_patient_arr)

In [46]:
# Same construction as `patients_diagnoses` (zero row + one row per patient).
# NOTE(review): not referenced by any cell visible in this notebook.
train_encoder = np.asarray(patients_diagnose_list)

In [54]:
import keras
from keras.models import Model
from keras.layers import Input, Dense
from keras.callbacks import TensorBoard, EarlyStopping
from utils import TrainValTensorBoard

In [55]:
# Dense autoencoder graph: n_features -> 100 -> 10 (bottleneck) -> 100 -> n_features.
# `encoded` ends up bound to the 10-unit bottleneck output, `decoded` to the
# sigmoid reconstruction of the input.
n_features = train_patient_arr.shape[1]
input_layer = Input(shape=(n_features,))

# Encoder half.
encoded = Dense(100, activation='relu')(input_layer)
encoded = Dense(10, activation='relu')(encoded)

# Decoder half.
decoded = Dense(100, activation='relu')(encoded)
decoded = Dense(n_features, activation='sigmoid')(decoded)

In [56]:
# Full autoencoder (input -> reconstruction), trained with mean squared error.
model = Model(input_layer, decoded)
model.compile(
    optimizer='ADAM',
    loss='mse',
)

In [57]:
from sklearn.model_selection import train_test_split

In [58]:
# Hold out part of the per-patient matrix for validation (default split sizes).
# The seed is fixed so the same rows land in the validation set on every run;
# without it the early-stopping point changes between runs for reasons
# unrelated to the model. (Network weight initialisation is still unseeded.)
X_train, X_test = train_test_split(patients_diagnoses, random_state=42)

In [59]:
# Train the autoencoder to reconstruct its own input. Training stops once
# val_loss has not improved for 10 consecutive epochs, so 2000 epochs is only
# an upper bound. Train/validation curves go to ./autoencoder_logs.
fit_callbacks = [
    TrainValTensorBoard(log_dir='./autoencoder_logs'),
    EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=0, mode='auto'),
]
model.fit(
    x=X_train,
    y=X_train,
    epochs=2000,
    batch_size=256,
    shuffle=True,
    validation_data=(X_test, X_test),
    callbacks=fit_callbacks,
    verbose=0,
)

<keras.callbacks.History at 0x7fb7bde1eb38>

In [73]:
# Encoder-only model: maps the autoencoder input to the tensor bound to
# `encoded`. It is built from the same layer graph as `model`, so it uses
# whatever weights those layers currently hold.
encoder = Model(input_layer, encoded)

In [74]:

# Encode every patient's frequency vector with a single batched predict()
# call instead of one predict() call per patient, which is dominated by
# per-call overhead for tens of thousands of patients.
# Each dict value keeps the original (1, code_dim) shape, so existing
# `patients_encoded[pid][0]` lookups keep working.
patient_ids = np.unique(train_df.Id_Пациента)
patient_profiles = np.stack([onehot_dict[patient] for patient in patient_ids])
patient_codes = encoder.predict(patient_profiles)
patients_encoded = {
    patient: patient_codes[row:row + 1]
    for row, patient in enumerate(patient_ids)
}

In [81]:
# Encoding of an all-zero frequency vector; predict() returns it as a
# one-row batch, i.e. with a leading axis of length 1.
zero_profile = np.zeros((1, len(diags_to_id) + 1))
zero_encoding = encoder.predict(zero_profile)

In [60]:
# Table that will receive the patient encodings.
# NOTE(review): despite the variable name, this reads the training CSV (the
# same path `train_df` is loaded from) -- confirm that is intended.
test_df = pd.read_csv(
    './data/train_data_complaints_repeats_doctors.csv',
    index_col=0,
)

In [85]:
# Attach the learned patient encoding to every row. Patients that were not
# encoded fall back to the encoding of the all-zero vector.
# Both branches take row 0 of a one-row predict() result so every cell holds
# a flat vector of the same shape; the fallback previously returned the whole
# (1, code_dim) batch while known patients got a (code_dim,) vector.
test_df['patient_encoded'] = test_df['Id_Пациента'].apply(
    lambda pid: patients_encoded[pid][0] if pid in patients_encoded else zero_encoding[0])

In [87]:
# Write the table with the added 'patient_encoded' column.
# NOTE(review): that column holds numpy arrays, which a CSV can only store as
# text -- confirm the downstream reader parses them back.
test_df.to_csv(path_or_buf='./data/train_data_complaints_repeats_doctors_patenc.csv')