In [1]:
import os
import pandas as pd
from scipy.stats import zscore

In [2]:
from utils import (fill_dates, 
                   classify_diabetes,
                   classify_gfr,
                   get_gfr)
from constants import Diabetes_level, GFR_level

In [7]:
class Summary:
    """Build per-id, period-binned feature tables from a folder of CSV tables.

    Every ``*.csv`` file in the folder is loaded into ``global_dic`` under its
    file name without the extension. All columns except ``id`` and the
    ``base_on`` column are suffixed with the table name (minus the ``T_``
    prefix) so the tables can be merged without column clashes.
    """

    # Column shared by the measurement tables and used as the merge key.
    base_on = 'time'
    # Width of the aggregation bins (pandas offset alias).
    period = '30D'
    # Columns aggregated with ``min`` per bin; every other column uses ``max``.
    higher_is_best = ['GFR']
    # Calendar date that day offset 0 of the prescriptions is mapped to.
    # NOTE(review): presumably the same origin `fill_dates` uses for the
    # 'dates' column -- confirm against utils.fill_dates.
    _date_origin = '2000-01-01'
    # Measurement tables merged by `merge_data_`, in merge order.
    _measurement_tables = ('T_creatinine', 'T_DBP', 'T_SBP',
                           'T_HGB', 'T_glucose', 'T_ldl')

    def __init__(self, folder_data) -> None:
        """Load all CSV tables found in ``folder_data``.

        Args:
            folder_data: Directory containing the CSV tables. It must hold at
                least ``T_meds.csv`` and ``T_stage.csv``.
        """
        self.global_dic = {}
        for file_name in sorted(os.listdir(folder_data)):
            # Skip anything that is not a CSV table (checkpoint folders,
            # OS metadata files, ...); read_csv would fail on those.
            if not file_name.endswith('.csv'):
                continue
            table = file_name[:-len('.csv')]
            suffix = table.replace('T_', '')
            frame = pd.read_csv(os.path.join(folder_data, file_name))
            frame.columns = [col if col in (self.base_on, 'id') else f'{col}_{suffix}'
                             for col in frame.columns]
            self.global_dic[table] = frame

        # Un-suffixed copies, so `drug`, `start_day`, `Stage_Progress`, ...
        # stay reachable under their original column names.
        self.meds = pd.read_csv(os.path.join(folder_data, 'T_meds.csv'))
        self.stage = pd.read_csv(os.path.join(folder_data, 'T_stage.csv'))
        self.drugs = self.meds.drug.unique().tolist()
        self.ids = self.meds.id.unique().tolist()

    def get_info_by_id(self, id) -> dict:
        """Return ``{table name: rows of that table whose id equals `id`}``."""
        return {key: value.loc[value.id == id]
                for key, value in self.global_dic.items()}

    def merge_data_(self, dic):
        """Outer-merge the measurement tables of one id on ``base_on``.

        Args:
            dic: Mapping as returned by `get_info_by_id`.

        Returns:
            One DataFrame sorted by ``base_on`` with an ``id`` column holding
            the id found in the first ``T_creatinine`` row.

        Raises:
            IndexError: If ``dic['T_creatinine']`` has no rows.
        """
        tables = [dic[name].drop(columns=['id']) for name in self._measurement_tables]
        merged = tables[0]
        for table in tables[1:]:
            merged = pd.merge(merged, table, on=self.base_on, how='outer')
        merged['id'] = dic['T_creatinine'].id.iloc[0]
        return merged.sort_values(by=self.base_on)

    def add_drugs(self, id, df):
        """Add one daily-dosage column per known drug to ``df``.

        Each prescription of ``id`` contributes ``daily_dosage`` to its drug
        column on every day from ``start_day`` to ``end_day`` inclusive (day
        offsets counted from ``_date_origin``). Overlapping prescriptions are
        summed. Drugs the id never received are filled with 0.

        Args:
            id: Id whose prescriptions are looked up in ``self.meds``.
            df: Frame with a datetime ``dates`` column to merge on.

        Returns:
            ``df`` outer-merged with the daily dosages (so days that only
            appear in a prescription add rows), with a column for every entry
            of ``self.drugs``.
        """
        meds = self.meds.loc[self.meds.id == id]
        origin = pd.Timestamp(self._date_origin)

        spans = []
        for drug, dose, start, end in zip(meds.drug, meds.daily_dosage,
                                          meds.start_day, meds.end_day):
            # Build the span from real dates instead of slicing a list with
            # `[start:]`, which silently misbehaves for negative offsets.
            days = pd.date_range(start=origin + pd.Timedelta(days=int(start)),
                                 end=origin + pd.Timedelta(days=int(end)),
                                 freq='1D')
            if len(days):
                spans.append(pd.DataFrame({'dates': days, drug: dose}))

        if spans:
            # No forward-fill here: filling NaNs across rows of *different*
            # prescriptions leaks doses into unrelated rows and double-counts
            # them in the daily sum. `sum` already treats NaN as 0.
            daily = (pd.concat(spans)
                     .groupby(pd.Grouper(key='dates', freq='1D'))
                     .sum()
                     .reset_index())
            df = pd.merge(df, daily, on='dates', how='outer')
        else:
            # No prescriptions for this id: nothing to merge, only zeros below.
            df = df.copy()

        for drug in self.drugs:
            if drug not in df.columns:
                df[drug] = 0
            else:
                # Assign back instead of chained `inplace=True`, which does
                # not update `df` under pandas copy-on-write.
                df[drug] = df[drug].fillna(0)
        return df

    def group(self, df):
        """Aggregate ``df`` into ``period``-wide bins over its ``dates`` column.

        Columns listed in ``higher_is_best`` take the per-bin minimum, all
        other columns the per-bin maximum. A ``time`` column is set to the
        number of days between each bin and the first bin.

        Returns:
            The aggregated frame with a fresh integer index.
        """
        lower_cols = [col for col in df if col not in self.higher_is_best]
        lowers = (df[lower_cols]
                  .groupby(pd.Grouper(key='dates', freq=self.period))
                  .max())
        highers = (df[self.higher_is_best + ['dates']]
                   .groupby(pd.Grouper(key='dates', freq=self.period))
                   .min())
        binned = highers.join(lowers)
        # The aggregated index already holds the bin labels, so there is no
        # need to iterate over the groups to recover them.
        bins = binned.index
        binned['time'] = [(label - bins[0]).days for label in bins]
        return binned.reset_index(drop=True)

    def prepare_sample(self, id):
        """Build the binned feature table for one id.

        Merges the measurement tables, inserts a ``GFR`` column computed by
        `get_gfr`, applies `classify_diabetes`, `classify_gfr` and
        `fill_dates`, forward-fills gaps, adds the drug columns and finally
        aggregates with `group`.

        Returns:
            DataFrame with ``time`` as first column and no ``id`` column.
            Values are returned unscaled.
        """
        info = self.get_info_by_id(id)
        df = self.merge_data_(info)
        # Positional unpacking: the first T_demo row is read as
        # (id, race, gender, age) -- assumes that column order; TODO confirm.
        _, race, gender, age = info['T_demo'].values[0]
        df.insert(loc=0, column='GFR',
                  value=get_gfr(df.value_creatinine, race, gender, age))
        df = classify_diabetes(df)
        df = classify_gfr(df)
        df = fill_dates(df).ffill()
        df = self.add_drugs(id, df)
        df = self.group(df).drop(columns=['id'])
        cols = ['time'] + [col for col in df.columns if col != 'time']
        return df[cols]

    def get_sample(self, id):
        """Return ``(feature table, Stage_Progress label)`` for one id.

        The label is looked up by the ``id`` column of the stage table rather
        than by row position, so it stays correct when the table is not
        sorted by id or the ids are not 0..N-1.

        Raises:
            IndexError: If the stage table has no row for ``id``.
        """
        label = self.stage.loc[self.stage.id == id, 'Stage_Progress']
        if label.empty:
            raise IndexError(f'no Stage_Progress record for id {id}')
        return self.prepare_sample(id), label.iloc[0]
        

In [8]:
# Load the tables from the 'data' folder (path relative to the working directory).
summary = Summary('data')

In [9]:
# Feature table (binned per Summary.period) and Stage_Progress label for id 297.
df, stage = summary.get_sample(297)

In [10]:
# Stage_Progress value returned by get_sample for id 297.
stage

True

In [11]:
# Binned feature table returned by get_sample for id 297 ('time' is the first column).
df

Unnamed: 0,time,GFR,GFR_level_G1,GFR_level_G2,GFR_level_G3a,value_creatinine,value_DBP,value_SBP,value_HGB,Diab_diabetes,...,bisoprolol,atenolol,lovastatin,olmesartan,canagliflozin,dapagliflozin,telmisartan,labetalol,nebivolol,propranolol
0,0,54.034854,0.0,0.0,1.0,1.25,66.89,145.56,13.97,0.0,...,0,0,0,0,0,0,0,0,0,0
1,30,54.034854,0.0,0.0,1.0,1.25,66.89,145.56,13.97,0.0,...,0,0,0,0,0,0,0,0,0,0
2,60,54.034854,0.0,0.0,1.0,1.25,66.89,145.56,13.97,0.0,...,0,0,0,0,0,0,0,0,0,0
3,90,54.034854,1.0,0.0,1.0,1.25,66.89,145.56,14.61,1.0,...,0,0,0,0,0,0,0,0,0,0
4,120,55.645385,1.0,0.0,0.0,1.22,59.59,95.32,14.61,1.0,...,0,0,0,0,0,0,0,0,0,0
5,150,55.645385,1.0,0.0,0.0,1.22,59.59,95.32,14.61,1.0,...,0,0,0,0,0,0,0,0,0,0
6,180,55.645385,1.0,0.0,0.0,1.22,59.59,95.32,14.61,1.0,...,0,0,0,0,0,0,0,0,0,0
7,210,55.645385,1.0,0.0,0.0,1.22,62.15,95.32,14.33,1.0,...,0,0,0,0,0,0,0,0,0,0
8,240,55.645385,1.0,0.0,0.0,1.22,62.15,94.41,14.33,1.0,...,0,0,0,0,0,0,0,0,0,0
9,270,55.645385,1.0,0.0,0.0,1.22,62.15,94.41,14.33,1.0,...,0,0,0,0,0,0,0,0,0,0
