In [1]:
import csv
import re
import tensorflow as tf
%matplotlib inline
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import numpy as np

In [2]:
def readCSV(csv_path, skip_firstRow=False, separator=',', encoding="utf-8-sig"):
    """Read a delimited text file into a list of rows.

    Args:
        csv_path: Path of the file to read.
        skip_firstRow: When True, the first row (typically a header) is left
            out of the result.
        separator: Field delimiter handed to ``csv.reader``.
        encoding: Text encoding used to open the file.

    Returns:
        A list of rows; each row is a list of strings. Empty for an empty file.
    """
    # newline='' lets the csv module handle line endings itself, so quoted
    # fields that contain line breaks are returned unmodified.
    with open(csv_path, 'rt', encoding=encoding, newline='') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=separator)
        if skip_firstRow:
            # The None default keeps an empty file from raising StopIteration.
            next(csvreader, None)
        return list(csvreader)

In [3]:
# Load the CSV export into a list of string rows, leaving out its first row.
list_one = readCSV('lungcheck_200813_smoke.csv',skip_firstRow=True)

In [4]:
list_one[:10]

[['27376764', '20170208', '68', '1갑/일', '1', '30년', '30', '38', '38', '1'],
 ['27377046', '20170307', '75', '반갑', '0.5', '50년', '50', '25', '25', '1'],
 ['27392425', '20170226', '67', '10개피', '0.5', '25년', '25', '42', '42', '1'],
 ['27392425', '20180318', '68', '10개피 ', '0.5', '25년', '25', '43', '43', '1'],
 ['27396612',
  '20170301',
  '73',
  '10개피/일 ',
  '0.5',
  '60년간 ',
  '60',
  '13',
  '13',
  '1'],
 ['27396612',
  '20170220',
  '73',
  '10개피/일',
  '0.5',
  '약 60년간',
  '60',
  '13',
  '13',
  '1'],
 ['27390289', '20170310', '74', '30개피/일', '1.5', '50년', '50', '24', '24', '1'],
 ['27393101', '20180703', '44', '1갑', '1', '20년', '20', '24', '24', '1'],
 ['27414844', '20170223', '74', '10개피', '0.5', '40년', '40', '34', '34', '1'],
 ['27414844', '20170321', '74', '1갑', '1', '60년', '60', '14', '14', '1']]

In [5]:
# Column 3 of every row is the raw text that is later tokenised as model input.
x = [row[3] for row in list_one]
# Longest entry measured in characters (0 when there are no rows).
xmax = max(map(len, x), default=0)
xmax

68

In [6]:
# Build the numeric target from column 4 and track the largest value seen.
y = []
ymax = 0
for row in list_one:
    try:
        value = float(row[4])
    except (ValueError, IndexError):
        # Best effort: a missing or non-numeric cell is recorded as 0 rather
        # than aborting the load. Only these two errors are expected here, so
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        y.append(0)
        continue
    y.append(value)
    ymax = max(ymax, value)
ymax

60.0

In [7]:
# Fit a word-level vocabulary on every input string (the train/test split
# happens later, so the vocabulary covers both parts).
# NOTE(review): with the default Tokenizer settings, punctuation such as '/'
# is presumably stripped before splitting on whitespace — confirm in the Keras docs.
src_tokenizer = Tokenizer()
src_tokenizer.fit_on_texts(x)

In [8]:
len(src_tokenizer.word_index) + 1

335

In [30]:
# Split inputs and targets 70/30 without shuffling, so row order is preserved.
# NOTE(review): random_state presumably has no effect when shuffle=False — confirm.
x_train_real, x_test_real, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=False, random_state=1004)

In [32]:
len(y_train)

6006

In [33]:
# Turn each raw string into a list of integer word ids from the fitted vocabulary.
x_train=src_tokenizer.texts_to_sequences(x_train_real)
x_test=src_tokenizer.texts_to_sequences(x_test_real)

In [34]:
x_test_real[:10]

['1', '1갑', '15개피', '15개피', '2갑/일', '1', '1갑반/1일', '한갑 반/1일', '01월 01일', '1갑']

In [35]:
x_test[:10]

[[3], [2], [11], [11], [7, 1], [3], [26, 6], [19, 35, 6], [8, 10], [2]]

In [36]:
# Padded sequence length: the longest tokenised training entry, but never less
# than xmax. The maximum is taken over all sequences (not just the last one),
# and default=0 keeps xmax_2 defined when x_train is empty.
xmax_2 = max(xmax, max(map(len, x_train), default=0))
xmax_2

68

In [37]:
# Reverse lookup (integer id -> word) taken from the fitted tokenizer.
index_to_word = src_tokenizer.index_word

In [38]:
# Bring every id sequence to the fixed length xmax_2; padding='post' appends
# the zeros after the real tokens.
x_train = pad_sequences(x_train, padding='post', maxlen=xmax_2)
x_test = pad_sequences(x_test, padding='post', maxlen=xmax_2)

In [39]:
# One-hot encode the targets with one class per integer in 0..ymax.
# ymax is a float (it is built from float() values), so cast it: the class
# count has to be an integer.
# NOTE(review): to_categorical presumably casts labels to int, which would fold
# fractional values such as 0.5 into class 0 — confirm this is intended.
num_classes = int(ymax) + 1
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)

In [40]:
x_train.shape

(6006, 68)

In [41]:
x_train[0]

array([2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [42]:
y_train.shape

(6006, 61)

In [43]:
y_train[0]

array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)

In [44]:
# Model input: xmax_2 integer word ids per sample.
words_input = tf.keras.layers.Input(shape=(xmax_2, ),dtype='int32', name='modelInput')
words_input

<tf.Tensor 'modelInput_1:0' shape=(None, 68) dtype=int32>

In [45]:
# Map each word id to a 128-dimensional vector; id 0 is the padding value and
# is masked (mask_zero=True). input_dim comes from the fitted tokenizer
# (number of known words + 1 for the padding id) instead of a literal, so it
# stays correct when the data or vocabulary changes.
vocab_size = len(src_tokenizer.word_index) + 1
words = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True)(words_input)
words

<tf.Tensor 'embedding_1/Identity:0' shape=(None, 68, 128) dtype=float32>

In [46]:
# Collapse the (sequence, embedding) axes into one feature vector per sample.
ff = tf.keras.layers.Flatten()(words)

In [47]:
# Softmax classifier with one unit per one-hot target column. The width is read
# from y_train instead of a literal so it cannot drift from the encoded labels.
denseLayer = tf.keras.layers.Dense(y_train.shape[1], activation='softmax')(ff)

In [48]:
# Assemble the functional model from the input tensor to the softmax output.
modelFunc = tf.keras.Model(inputs=words_input, outputs=denseLayer)

In [49]:
# Categorical cross-entropy pairs with the one-hot targets and softmax output;
# summary() prints the layer table shown below.
modelFunc.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
modelFunc.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
modelInput (InputLayer)      [(None, 68)]              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 68, 128)           42880     
_________________________________________________________________
flatten_1 (Flatten)          (None, 8704)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 61)                531005    
Total params: 573,885
Trainable params: 573,885
Non-trainable params: 0
_________________________________________________________________


In [50]:
# Ask TensorFlow to allocate GPU memory on demand.
# NOTE(review): this runs after TensorFlow has been imported and the model has
# been built; the variable presumably only takes effect if it is set before the
# GPU is initialised — consider moving it (and this import) to the first cell.
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

In [51]:
# Train for 20 epochs with mini-batches of 128 samples; the returned History
# object is only echoed, not stored.
modelFunc.fit(x_train, y_train, batch_size=128, epochs=20)

Train on 6006 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x224e3e7ee48>

In [52]:
# Score the held-out split; the result is [loss, accuracy] as configured in compile().
modelFunc.evaluate(x_test, y_test,verbose=0)

[0.14063263780465632, 0.97047395]

In [53]:
x_test[0]

array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0])

In [66]:
# Softmax output of the model for every test sample (one score per class).
predictions=modelFunc.predict(x_test)

In [71]:
# Review the test predictions whose predicted class is above 3: print the actual
# and predicted class and, when they differ, the raw input text.
# Predicted classes 0 to 3 are not reported.
predicted_classes = np.argmax(predictions, axis=1)
actual_classes = np.argmax(y_test, axis=1)
for i in range(len(x_test)):
    predicted = predicted_classes[i]
    if predicted <= 3:
        continue
    actual = actual_classes[i]
    print('실제값: {} 예측값: {}'.format(actual, predicted))
    if actual != predicted:
        print(x_test_real[i])

실제값: 20 예측값: 20
실제값: 20 예측값: 20
실제값: 20 예측값: 20
실제값: 5 예측값: 5
실제값: 20 예측값: 20
실제값: 4 예측값: 4
실제값: 10 예측값: 10
실제값: 1 예측값: 4
3~4갑 ->1갑 감소
실제값: 20 예측값: 20
실제값: 2 예측값: 20
20~40개
실제값: 5 예측값: 5
실제값: 0 예측값: 40
   40 PYR ex-smoker (10yr quit) 
실제값: 10 예측값: 10
실제값: 20 예측값: 20
실제값: 30 예측값: 30
실제값: 2 예측값: 40
40 개피/1일
실제값: 20 예측값: 20
실제값: 20 예측값: 20
실제값: 10 예측값: 10
실제값: 40 예측값: 40
실제값: 20 예측값: 20
실제값: 0 예측값: 10
10-15개비
실제값: 10 예측값: 10
실제값: 20 예측값: 20
실제값: 20 예측값: 20
실제값: 0 예측값: 20
20/40
실제값: 20 예측값: 20
실제값: 20 예측값: 20
실제값: 10 예측값: 10
실제값: 10 예측값: 10
실제값: 6 예측값: 5
5-6/일
실제값: 30 예측값: 30
실제값: 20 예측값: 20
실제값: 10 예측값: 10
실제값: 4 예측값: 4


In [None]:
# Position of the single test sample to inspect.
inx=2111

In [None]:

# '예측값' is the predicted value and '실제값' the actual value, so the model
# output goes first and the ground-truth label second.
print('예측값: {} 실제값: {}'.format(np.argmax(predictions[inx]), np.argmax(y_test[inx])))