In [467]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import numpy as np
import tensorflow.keras as keras
from tensorflow.keras import models, layers


In [468]:
import pandas as pd
import os

In [469]:
def medication_data(income_path):
    csv_path = os.path.join(income_path,"mu.csv")
    return pd.read_csv(csv_path,sep=',',engine='python')

In [470]:
result = medication_data(".")

In [471]:
len(result)

479230

In [472]:
result.head()

Unnamed: 0,order,patient_number,presc_date,presc_dept,order_date,presc_code,presc_information,order_code,order_name,sum_medic_count,medic_duration
0,0,10505120,2016-08-18,DM,2016-08-18,L209,"Atopic dermatitis, unspecified",VAS10-O,Vaseline 10g*,2,1
1,1,10505120,2016-08-18,DM,2016-08-18,L209,"Atopic dermatitis, unspecified",PIMEC-CR1,Elidel Cream 1% 10g*,2,1
2,2,10505120,2016-08-18,DM,2016-08-18,L209,"Atopic dermatitis, unspecified",PDV-CR,Prednisolone Valeroacetate 1.3% 20g,2,1
3,3,10105557,2016-01-26,OS,2016-01-26,T143,"Dislocation, sprain and strain of unspecified ...",NAPRO5SOM2,Vimovo tab 500/20mg,2,15
4,4,10064596,2016-01-21,OS,2016-01-21,G629,"Polyneuropathy, unspecified",LAYLA,Layla tab,2,30


In [473]:
result.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 479230 entries, 0 to 479229
Data columns (total 11 columns):
order                479230 non-null int64
patient_number       479230 non-null int64
presc_date           479230 non-null object
presc_dept           479230 non-null object
order_date           479230 non-null object
presc_code           479230 non-null object
presc_information    479230 non-null object
order_code           479230 non-null object
order_name           478743 non-null object
sum_medic_count      479230 non-null int64
medic_duration       479230 non-null int64
dtypes: int64(4), object(7)
memory usage: 40.2+ MB


In [474]:
result['patient_number'].value_counts()

10271364    834
10016177    709
10595340    684
10669901    659
10315998    612
           ... 
10652216      1
10651525      1
10481386      1
10409677      1
10046097      1
Name: patient_number, Length: 37014, dtype: int64

In [475]:
result.describe()

Unnamed: 0,order,patient_number,sum_medic_count,medic_duration
count,479230.0,479230.0,479230.0,479230.0
mean,239614.5,10440130.0,1.82971,14.948079
std,138341.929089,223541.5,1.063725,26.829278
min,0.0,10000100.0,1.0,1.0
25%,119807.25,10243110.0,1.0,1.0
50%,239614.5,10520120.0,2.0,4.0
75%,359421.75,10640610.0,2.0,14.0
max,479229.0,10715750.0,24.0,365.0


In [476]:
import matplotlib.pyplot as plt

In [477]:
plt.show()

In [478]:
LABEL_COLUMN ='sum_medic_count'

In [479]:
LABELS = [0,1]

In [480]:
def get_dataset(file_path, **kwargs):
    dataset = tf.data.experimental.make_csv_dataset(file_path,
                                                    batch_size=32, # Artificially small to make examples easier to show.
                                                    label_name=LABEL_COLUMN,
                                                    na_value="?",
                                                    num_epochs=1,
                                                    num_parallel_reads=1,
                                                    ignore_errors=True,
                            
                                                    **kwargs)
    return dataset

In [481]:
PATH = './mu.csv'

In [482]:
raw_train_data = get_dataset(PATH)

In [483]:
def show_batch(dataset):
    for batch, label in dataset.take(1):
        for key, value in batch.items():
            print("{:20s}:{}".format(key,value.numpy()))
        print("label:",label.numpy())

In [484]:
show_batch(raw_train_data)

order               :[9336 5887  842 4197 8051 7520 8677 7714 1523 6539 8719 1035  154 2261
 8193 1362 9262 3847 6086 7027 2372 8584   22 8488 6870 6786 7123 9135
 2195 8382 4355 4028]
patient_number      :[10624851 10577163 10124887 10195824 10142566 10497931 10001574 10163191
 10511990 10549142 10599467 10005676 10206758 10234893 10251255 10561527
 10169696 10560814 10629008 10035410 10311450 10597183 10276378 10620148
 10072802 10128299 10068451 10462087 10096720 10481158 10598858 10148196]
presc_date          :[b'2016-01-25' b'2016-01-11' b'2016-01-27' b'2016-01-22' b'2016-01-25'
 b'2016-03-31' b'2016-03-23' b'2016-06-10' b'2016-02-03' b'2016-02-15'
 b'2016-01-15' b'2016-04-11' b'2016-02-01' b'2016-01-29' b'2016-02-15'
 b'2016-04-27' b'2016-03-17' b'2016-02-23' b'2016-01-04' b'2016-03-23'
 b'2016-03-21' b'2016-01-12' b'2016-01-19' b'2016-01-19' b'2016-02-05'
 b'2016-03-16' b'2016-01-14' b'2016-04-28' b'2016-02-04' b'2016-03-16'
 b'2016-02-16' b'2016-01-25']
presc_dept          :[b'

In [485]:
contri_var = result.columns[result.dtypes!='object']

In [486]:
list(contri_var)

['order', 'patient_number', 'sum_medic_count', 'medic_duration']

In [487]:
class PackNumericFeatures(object):
    def __init__(self, names):
        self.names = names

    def __call__(self, features, labels):
        numeric_freatures = [features.pop(name) for name in self.names]
        numeric_features = [tf.cast(feat, tf.float32) for feat in numeric_freatures]
        numeric_features = tf.stack(numeric_features, axis=-1)
        features['numeric'] = numeric_features
        
        matches = tf.greater_equal(3, labels)
        onehot = tf.cast(matches, tf.float32)
        labels = onehot
        
        return features, labels

In [488]:
NUMERIC_FEATURES = list(contri_var)
NUMERIC_FEATURES

['order', 'patient_number', 'sum_medic_count', 'medic_duration']

In [489]:
packed_train_data = raw_train_data.map(PackNumericFeatures(NUMERIC_FEATURES))

KeyError: in converted code:

    <ipython-input-487-d8ac8ee5271e>:6 __call__  *
        numeric_freatures = [features.pop(name) for name in self.names]
    C:\Users\hyun\Anaconda3\envs\ilhyun\lib\site-packages\tensorflow_core\python\autograph\impl\api.py:396 converted_call
        return py_builtins.overload_of(f)(*args)

    KeyError: 'sum_medic_count'


In [445]:
desc = result[NUMERIC_FEATURES].describe()

In [446]:
desc

Unnamed: 0,order,patient_number,sum_medic_count,medic_duration
count,479230.0,479230.0,479230.0,479230.0
mean,239614.5,10440130.0,1.82971,14.948079
std,138341.929089,223541.5,1.063725,26.829278
min,0.0,10000100.0,1.0,1.0
25%,119807.25,10243110.0,1.0,1.0
50%,239614.5,10520120.0,2.0,4.0
75%,359421.75,10640610.0,2.0,14.0
max,479229.0,10715750.0,24.0,365.0


In [447]:
MEAN = np.array(desc.T['mean'])

In [448]:
STD = np.array(desc.T['std'])

In [449]:
def normalize_numeric_data(data,mean,std):
    return (data-mean)/std

In [450]:
import functools

In [451]:
normalizer = functools.partial(normalize_numeric_data,mean=MEAN,std=STD)
numeric_column = tf.feature_column.numeric_column('numeric',normalizer_fn=normalizer,shape=[len(NUMERIC_FEATURES)])

In [452]:
numeric_columns = [numeric_column]
numeric_column

NumericColumn(key='numeric', shape=(4,), default_value=None, dtype=tf.float32, normalizer_fn=functools.partial(<function normalize_numeric_data at 0x000001E638794048>, mean=array([2.39614500e+05, 1.04401327e+07, 1.82971016e+00, 1.49480792e+01]), std=array([1.38341929e+05, 2.23541511e+05, 1.06372516e+00, 2.68292779e+01])))

In [453]:
categorical = list(result.columns[result.dtypes == 'object'])
categorical

['presc_date',
 'presc_dept',
 'order_date',
 'presc_code',
 'presc_information',
 'order_code',
 'order_name']

In [454]:
def get_keys(data,feature):
    return list(result[feature].value_counts().keys())

In [456]:
get_keys(result,"order_code")

['XNS50C',
 'LDPIZ-S',
 'TROPHE-T',
 'BIOF250-K',
 'CP4PHE1-S',
 'XDNK2',
 'BUD2-RP',
 'PD5',
 'IBUP-S',
 'XNS1L',
 'CODENAL',
 'APH6ER',
 'APH32-S',
 'PSUTRI-S',
 'NAPRO5SOM',
 'CNZEP',
 'ASP1PR',
 'LCTZIN',
 'COPIVY-S',
 'COPIVY-K',
 'VAS10-O',
 'AMBR-S',
 'RASUBI1W',
 'XAMBR15',
 'CAROLF',
 'ACECL',
 'HURON15-T',
 'TULOB.5-H',
 'XNS100',
 'QUET25',
 'PDV-CR',
 'DIBUP',
 'MOM514-NS',
 'TAMSUD',
 'ATORV10',
 'SKDASE',
 'NORT1',
 'NORV',
 'MGO5',
 'XPCMOL',
 'REBAM',
 'GINKB80',
 'CELEB',
 'METFOR5XR',
 'PSEP',
 'HURON15W-T',
 'PREGA75',
 'CLOPID',
 'PANTO20W',
 'DEXLANS3',
 'REBAM1U',
 'LZPAM',
 'LINAGL5',
 'RASUBID',
 'MEQUIT',
 'MONTEL10',
 'LOFLO-T',
 'CPDXM',
 'STILL',
 'BEPOT10',
 'ACYST2',
 'AMX7CL-S',
 'ACEBR100',
 'XNS500',
 'EPERIS',
 'LAFUT10',
 'LAYLA',
 'MET1GXR',
 'AMBR30',
 'TRIME300S',
 'FURO',
 'ALPRA2',
 'AST1W',
 'XDNK35',
 'CTAMET5W',
 'XMEXO',
 'FEROBA',
 'DULOX30',
 'TRIME-S',
 'DIQUA-T',
 'ATORV20',
 'CPDXM-S',
 'MOXIFL-T',
 'XTAMC40',
 'SALB-NB',
 'CACA5D10',
 '

In [457]:
def get_categories(data,c_list):
    category={}
    for i in c_list[:-1]:
        category[i] = get_keys(data,i)
    return category

In [459]:
CATEGORIES = get_categories(result, categorical)
CATEGORIES['order_code']

['XNS50C',
 'LDPIZ-S',
 'TROPHE-T',
 'BIOF250-K',
 'CP4PHE1-S',
 'XDNK2',
 'BUD2-RP',
 'PD5',
 'IBUP-S',
 'XNS1L',
 'CODENAL',
 'APH6ER',
 'APH32-S',
 'PSUTRI-S',
 'NAPRO5SOM',
 'CNZEP',
 'ASP1PR',
 'LCTZIN',
 'COPIVY-S',
 'COPIVY-K',
 'VAS10-O',
 'AMBR-S',
 'RASUBI1W',
 'XAMBR15',
 'CAROLF',
 'ACECL',
 'HURON15-T',
 'TULOB.5-H',
 'XNS100',
 'QUET25',
 'PDV-CR',
 'DIBUP',
 'MOM514-NS',
 'TAMSUD',
 'ATORV10',
 'SKDASE',
 'NORT1',
 'NORV',
 'MGO5',
 'XPCMOL',
 'REBAM',
 'GINKB80',
 'CELEB',
 'METFOR5XR',
 'PSEP',
 'HURON15W-T',
 'PREGA75',
 'CLOPID',
 'PANTO20W',
 'DEXLANS3',
 'REBAM1U',
 'LZPAM',
 'LINAGL5',
 'RASUBID',
 'MEQUIT',
 'MONTEL10',
 'LOFLO-T',
 'CPDXM',
 'STILL',
 'BEPOT10',
 'ACYST2',
 'AMX7CL-S',
 'ACEBR100',
 'XNS500',
 'EPERIS',
 'LAFUT10',
 'LAYLA',
 'MET1GXR',
 'AMBR30',
 'TRIME300S',
 'FURO',
 'ALPRA2',
 'AST1W',
 'XDNK35',
 'CTAMET5W',
 'XMEXO',
 'FEROBA',
 'DULOX30',
 'TRIME-S',
 'DIQUA-T',
 'ATORV20',
 'CPDXM-S',
 'MOXIFL-T',
 'XTAMC40',
 'SALB-NB',
 'CACA5D10',
 '

In [198]:
categorical_columns = []
for feature, vocab in CATEGORIES.items():
    cat_col = tf.feature_column.categorical_column_with_vocabulary_list(key=feature, 
                                                                        vocabulary_list=vocab)
    categorical_columns.append(tf.feature_column.indicator_column(cat_col))

In [460]:
categorical_columns[0]

IndicatorColumn(categorical_column=VocabularyListCategoricalColumn(key=' order_date', vocabulary_list=('2016-12-19', '2016-12-12', '2016-12-05', '2016-11-28', '2016-12-14', '2016-11-30', '2016-12-13', '2016-12-21', '2016-12-15', '2016-12-16', '2016-12-26', '2016-12-20', '2016-11-14', '2016-12-23', '2016-12-22', '2016-11-07', '2016-12-02', '2016-11-21', '2016-12-01', '2016-11-01', '2016-11-29', '2016-11-02', '2016-12-27', '2016-11-16', '2016-12-07', '2016-12-28', '2016-11-23', '2016-12-09', '2016-12-06', '2016-11-22', '2016-12-30', '2016-11-15', '2016-11-24', '2016-12-08', '2016-11-08', '2016-11-18', '2016-11-11', '2016-11-09', '2016-11-17', '2016-12-29', '2016-11-10', '2016-11-25', '2016-11-03', '2016-11-04', '2016-12-03', '2016-12-10', '2016-12-17', '2016-12-24', '2016-11-12', '2016-11-05', '2016-11-19', '2016-11-26', '2016-12-31', '2016-12-18', '2016-12-25', '2016-12-11', '2016-12-04', '2016-11-13', '2016-11-27', '2016-11-06', '2016-11-20'), dtype=tf.string, default_value=-1, num_oov

In [461]:
categorical_layer = tf.keras.layers.DenseFeatures(categorical_columns)


In [462]:
preprocessing_layer = tf.keras.layers.DenseFeatures(categorical_columns+numeric_columns)