# Digital Medicine 2021 


In [None]:
import os
import string
from tqdm import tqdm
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords, wordnet
import nltk.stem
import pandas as pd
from collections import Counter

def get_wordnet_pos(word):
    """Return the WordNet POS constant to pass to ``lemmatize()`` for ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the upper-cased
    first letter of that tag selects adjective, noun, verb or adverb.
    Any other tag falls back to noun.
    """
    _, pos_label = nltk.pos_tag([word])[0]
    initial = pos_label[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag resolve to noun.
    return wordnet.NOUN



def PreprocessData_w_Lemmatizer(dir_path, phase='train'):
    """Read, clean and lemmatize every text file found in ``dir_path``.

    Each file is lower-cased, stripped of punctuation and of every
    non-alphabetic character, tokenized, lemmatized with a per-word POS tag,
    filtered of English stopwords and re-joined into one space-separated
    string.

    Args:
        dir_path: Directory containing the text files. Files are processed in
            the order of the text after the last ``_`` of the file name
            (extension removed), compared as strings.
        phase: ``'train'``, ``'test'`` or ``'valid'``.

    Returns:
        ``(contents, labels)`` when ``phase == 'train'``, otherwise
        ``(contents, labels, text_idx)``:

        * ``contents`` - one cleaned string per file.
        * ``labels`` - for every phase except ``'valid'``, 0 when the file
          name prefix (text before the first ``_``) is ``'U'`` or ``'N'``
          and 1 otherwise; empty for ``'valid'``.
        * ``text_idx`` - the files whose cleaned text contains one of the
          keywords, each listed once: file names for ``'valid'``, positional
          indices for ``'test'``; empty for any other phase.
    """
    allFileList = os.listdir(dir_path)
    # os.path.splitext removes only the extension; str.rstrip('.txt') strips
    # any trailing '.', 't' or 'x' characters, which can eat into the stem.
    # The key is still compared as a string, so the resulting order is the
    # same as before for names of the form <prefix>_<digits>.txt.
    allFileList.sort(key=lambda x: os.path.splitext(x)[0].split('_')[-1])
    contents = []
    labels = []
    text_idx = []
    word_list = ['obesity', 'obestity', 'obese', 'obeseskin', 'panniculectomy', 'lose weight']

    remove = str.maketrans('', '', string.punctuation)
    lemmatizer = WordNetLemmatizer()
    # Build the stopword set once: stopwords.words() returns a list, so the
    # per-token call both rebuilt it and scanned it linearly for every word.
    stop_words = set(stopwords.words('english'))

    for idx, file_name in enumerate(tqdm(allFileList)):
        label = file_name.split('_')[0]
        # The context manager closes the handle even if reading fails.
        with open(os.path.join(dir_path, file_name), 'r') as file:
            content = file.read().lower()
        content = content.translate(remove)
        content = content.replace('\n', ' ')
        # Keep alphabetic characters only, word by word.
        content = ' '.join(''.join(filter(str.isalpha, x)) for x in content.split(' '))
        content = [lemmatizer.lemmatize(ws, get_wordnet_pos(ws)) for ws in nltk.word_tokenize(content)]
        content = ' '.join(w for w in content if w not in stop_words)
        contents.append(content)

        if phase != 'valid':
            labels.append(0 if label in ('U', 'N') else 1)

        # Substring match against the cleaned text; any() records a file once
        # even when several keywords occur in it.
        if phase in ('valid', 'test') and any(word in content for word in word_list):
            text_idx.append(file_name if phase == 'valid' else idx)

    if phase != 'train':
        return contents, labels, text_idx
    return contents, labels



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Load dataset

In [None]:
# Absolute paths of the three dataset directories.
train_dir_path = '/content/drive/MyDrive/數位醫學/Train_Textual'
test_dir_path = '/content/drive/MyDrive/數位醫學/Test_Intuitive'
valid_dir_path = '/content/drive/MyDrive/數位醫學/Validation'

# Training split: cleaned texts and their 0/1 labels.
train_contents, train_labels = PreprocessData_w_Lemmatizer(
    dir_path=train_dir_path, phase='train')
# Test split: additionally returns the positions of keyword-matching files.
test_contents, test_labels, text_idx = PreprocessData_w_Lemmatizer(
    dir_path=test_dir_path, phase='test')
# Validation split: no labels are produced; matches are reported as file names.
val_contents, _, val_idx = PreprocessData_w_Lemmatizer(
    dir_path=valid_dir_path, phase='valid')

100%|██████████| 400/400 [02:12<00:00,  3.02it/s]
100%|██████████| 400/400 [02:13<00:00,  3.01it/s]
100%|██████████| 50/50 [00:16<00:00,  3.10it/s]


## Tokenization

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the tokenizer, and the fixed length of every
# padded sequence.
MAX_NB_WORDS = 20000
MAX_SEQUENCE_LENGTH = 2000

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(train_contents)

# Encode the three splits with the same fitted tokenizer.
train_sequences, test_sequences, val_sequences = (
    tokenizer.texts_to_sequences(texts)
    for texts in (train_contents, test_contents, val_contents)
)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Bring every sequence to MAX_SEQUENCE_LENGTH entries.
train_data, test_data, val_data = (
    pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    for sequences in (train_sequences, test_sequences, val_sequences)
)

print('Shape of training data tensor:', train_data.shape)
print('Shape of testing data tensor:', test_data.shape)
print('Shape of validation data tensor:', val_data.shape)

Found 15955 unique tokens.
Shape of training data tensor: (400, 2000)
Shape of testing data tensor: (400, 2000)
Shape of validation data tensor: (50, 2000)


## Load pre-trained word embeddings (GloVe embeddings) 

In [None]:
import numpy as np

# Maps each token in the GloVe file to its float32 vector.
embeddings_index = {}
# The context manager closes the file even if a line fails to parse, and the
# explicit encoding makes the read independent of the platform default.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        # First field is the token, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [None]:
EMBEDDING_DIM = 100
# Row i holds the pre-trained vector of the word whose tokenizer index is i.
# The matrix starts as zeros, so words missing from embeddings_index keep an
# all-zero row. One extra row is allocated beyond len(word_index)
# (presumably because tokenizer indices start at 1 - confirm).
embedding_matrix = np.zeros(shape=(len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

## Word Embeddings

In [None]:
from keras.layers import Embedding

# Non-trainable embedding layer initialised with embedding_matrix.
# NOTE(review): the Sequential model defined further down constructs its own
# Embedding(MAX_NB_WORDS, 128, ...) and never references embedding_layer, so
# these pre-trained weights do not reach the trained model - confirm intent.
embedding_layer = Embedding(
    input_dim=len(word_index) + 1,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

## Process labels of text files

In [None]:
from keras.utils import np_utils

# Expand the 0/1 labels of the train and test splits into two-class
# categorical arrays.
transfer_train_labels, transfer_test_labels = (
    np_utils.to_categorical(np.asarray(split_labels), num_classes=2)
    for split_labels in (train_labels, test_labels)
)

## Set up Bidirectional LSTM model

In [None]:
import tensorflow as tf
from keras.layers import Dense, Input, Flatten, Dropout, LSTM, GRU, Bidirectional, SimpleRNN, CuDNNLSTM, Activation, Conv2D, MaxPooling2D
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, SpatialDropout1D, GlobalAveragePooling1D, Concatenate, Reshape
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential

# Bi-LSTM classifier: embedding -> bidirectional LSTM -> dropout -> 2 outputs.
# NOTE(review): this Embedding is freshly initialised (128 dimensions); the
# GloVe-backed embedding_layer built earlier is not used here - confirm intent.
# NOTE(review): two sigmoid units trained with binary_crossentropy on the
# two-column categorical labels; softmax with categorical_crossentropy is the
# more usual pairing - confirm this is deliberate.
model = Sequential([
    Embedding(MAX_NB_WORDS, 128, input_length=MAX_SEQUENCE_LENGTH),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(2, activation='sigmoid'),
])

# Adam optimizer with explicit hyper-parameters.
adam = tf.keras.optimizers.Adam(
    learning_rate=1e-3,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-08,
    decay=0.1,
)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

# Keep only the weights of the epoch with the highest val_accuracy.
filepath="weights.best.final.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

## Start Training...

In [None]:
# Train for 50 epochs; the checkpoint callback stores the best weights.
# NOTE(review): the test split is passed as validation_data, so the checkpoint
# is selected on the same data later reported as test accuracy.
model.fit(
    train_data,
    transfer_train_labels,
    batch_size=32,
    epochs=50,
    verbose=0,
    callbacks=callbacks_list,
    validation_data=(test_data, transfer_test_labels),
)

## Predict the test text files with the trained model and show the accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Restore the checkpointed weights before evaluating.
model.load_weights("weights.best.final.hdf5")
# Predicted class = index of the larger of the two output scores.
prediction = np.argmax(model.predict(test_data), axis=1)
# Keyword override: files flagged during preprocessing are forced to class 1.
prediction[text_idx] = 1
acc = accuracy_score(y_true=test_labels, y_pred=prediction)
print('Accuracy on test data: %f' % (acc*100))


Accuracy on test data: 91.750000


## Predict the validation text files with the trained model and save the output labels to a CSV file

In [None]:
# Predict the validation files and fill in the submission table.
df = pd.read_csv('/content/drive/MyDrive/數位醫學/sample_submission.csv', index_col='Filename')
prediction = np.argmax(model.predict(val_data), axis=1)
# Assigned by position: assumes the rows of the sample submission are in the
# same order as the preprocessed validation files - TODO confirm.
df['Obesity'] = prediction
# Keyword override: files whose text contained a keyword are forced to 1.
df.loc[val_idx, ['Obesity']] = 1

In [None]:
# Display the submission table to inspect the predicted Obesity labels.
df

Unnamed: 0_level_0,Obesity
Filename,Unnamed: 1_level_1
ID_1159.txt,1
ID_1160.txt,1
ID_1162.txt,1
ID_1167.txt,1
ID_1168.txt,1
ID_1176.txt,1
ID_1180.txt,0
ID_1183.txt,0
ID_1184.txt,1
ID_1185.txt,1


In [None]:
# Write the submission table, including the Obesity predictions, to a CSV file.
df.to_csv('/content/drive/MyDrive/數位醫學/predictions.csv')