In [1]:
import re
import sys
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.sequence import pad_sequences
from keras.layers import LSTM, Dense
from keras.models import Sequential

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


### Initializing the Embeddings (GloVe)

In [2]:
EMBEDDING_FILE = 'glove.42B.300d.txt'
embeddings_index = {}
count = 0
# Each non-empty line is parsed as "<token> <v1> <v2> ...": the first
# whitespace-separated field is the word, the rest is its float32 vector.
# The context manager closes the file even if parsing a line raises.
with open(EMBEDDING_FILE, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
        count += 1
        # Lightweight progress indicator for the long load.
        if count % 500000 == 0:
            print(count)

500000
1000000
1500000


In [3]:
def get_embeddings(word):
    """Look up the embedding vector for ``word``.

    Args:
        word: Token used as a key into the module-level ``embeddings_index``.

    Returns:
        The stored vector when the token is known, otherwise a fresh
        300-element array of zeros.
    """
    try:
        return embeddings_index[word]
    except KeyError:
        # Unknown token: fall back to an all-zero vector.
        return np.zeros((300,))

In [4]:
# Load the training and validation CSV files from the current working directory.
# NOTE(review): df_validation is not used by any cell visible in this notebook.
df_train = pd.read_csv('urop_dataset_training.csv')
df_validation = pd.read_csv('urop_dataset_validation.csv')

In [5]:
def make_POD(curr):
    """Collapse the "POD/brain" response label into plain "POD".

    Args:
        curr: A response label string.

    Returns:
        "POD" when ``curr`` is exactly "POD/brain"; otherwise ``curr`` unchanged.
    """
    return "POD" if curr == "POD/brain" else curr

def preprocess(df_train):
    """Normalise the target labels and clean the scan report text.

    Args:
        df_train: DataFrame with the string columns
            "Objective Response per RECIST v1.1" and "Scan report text".

    Returns:
        A new DataFrame (the input is left untouched) in which the label
        column is whitespace-stripped and passed through ``make_POD``, and a
        "clean_report_text" column holds the lower-cased report with every run
        of non-word characters replaced by one space, followed by exactly one
        trailing space.
    """
    label_column = "Objective Response per RECIST v1.1"
    # Work on a copy: the caller may pass a filtered slice of another frame,
    # and assigning columns on such a slice triggers pandas'
    # chained-assignment warning and silently mutates the caller's data.
    df_train = df_train.copy()
    # this field is the class we're trying to predict and hence we have to strip any whitespaces from it
    df_train[label_column] = df_train[label_column].apply(lambda label: make_POD(label.strip()))
    # cleaning scan report text - keep only words and numbers with spaces between them.
    # The raw string avoids the invalid "\W" escape in a normal string literal;
    # the trailing space keeps reports separated when they are concatenated later.
    df_train["clean_report_text"] = df_train["Scan report text"].apply(
        lambda text: re.sub(r'\W+', ' ', text).lower().strip() + ' '
    )
    return df_train

In [6]:
# Keep only the rows whose "Scan included on RECIST form? (y/n)" value is "yes".
# ``.copy()`` makes the result an independent frame, so columns added to it by
# later cells do not trigger pandas' chained-assignment (SettingWithCopy) warning.
df_train = df_train[df_train["Scan included on RECIST form? (y/n)"] == "yes"].copy()

In [7]:
def make_groupings(df_train):
    """Group report text per patient into baseline and non-baseline documents.

    Args:
        df_train: DataFrame containing 'Patient ID', the scan-timepoint column,
            'clean_report_text' and "Objective Response per RECIST v1.1".
            As a side effect an ``is_baseline`` boolean column is added to it.

    Returns:
        A tuple ``(baseline_reports, progress_reports, predictions)``:
        ``baseline_reports`` / ``progress_reports`` are DataFrames with columns
        'Patient ID', 'is_baseline' and 'clean_report_text' (the concatenated
        report text of that patient for baseline / non-baseline scans), and
        ``predictions`` is a Series of the first response label per patient.
        All three are ordered by patient id, so row ``i`` of each refers to the
        same patient.
    """
    # group the reports by patient and baseline
    column_patient = 'Patient ID'
    column_baseline = 'Scan timepoint (baseline = prior to treatment start, ontx = during treatment or prior to progression if stopped treatment , progression = time of RECIST defined progression)'
    # Text used when a patient has no report on one side of the split.
    placeholder = 'insert random word'

    df_train['is_baseline'] = (df_train[column_baseline] == 'baseline')
    groupped_df = df_train.groupby([column_patient, 'is_baseline'])['clean_report_text'].apply(
        lambda texts: ''.join(texts)
    )
    predictions = df_train.groupby([column_patient])["Objective Response per RECIST v1.1"].first()

    # Fill missing (patient, is_baseline) combinations with the placeholder.
    # Reindexing onto the full patient x {True, False} product replaces the old
    # mutate-while-iterating loop (Series.iteritems was removed in pandas 2.0)
    # and keeps every patient in sorted order instead of appending the filled
    # rows at the end, which could misalign features and labels.
    patients = groupped_df.index.get_level_values(column_patient).unique()
    full_index = pd.MultiIndex.from_product(
        [patients, [True, False]], names=[column_patient, 'is_baseline']
    )
    groupped_df = groupped_df.reindex(full_index, fill_value=placeholder)

    # now create the different dataframes; reset_index(drop=True) yields
    # independent frames that callers can extend with new columns safely.
    groupped_df = groupped_df.to_frame().reset_index()
    baseline_mask = groupped_df['is_baseline'].eq(True)
    baseline_reports = groupped_df[baseline_mask].reset_index(drop=True)
    progress_reports = groupped_df[~baseline_mask].reset_index(drop=True)
    return (baseline_reports, progress_reports, predictions)

In [8]:
def prepare_y(data_y):
    """Encode class labels as integer ids.

    Args:
        data_y: Object exposing the labels as a 1-D array via ``.values``
            (e.g. a pandas Series).

    Returns:
        1-D integer array with one encoded class id per label.

    TODO: one-hot encoding is not applied here yet.
    """
    # Fit and transform in one step on the 1-D label array. Reshaping to a
    # column vector, as was done before, makes scikit-learn emit a
    # DataConversionWarning before flattening it again.
    label_enc = LabelEncoder()
    return label_enc.fit_transform(data_y.values)

In [9]:
def try_model(df_train, baseline_features=5000, progress_features=5000, C=0.1):
    """Run the preprocessing pipeline and return the pieces needed for modelling.

    Args:
        df_train: Raw training DataFrame accepted by ``preprocess``.
        baseline_features: Currently unused by this function.
        progress_features: Currently unused by this function.
        C: Passed through unchanged in the returned tuple.

    Returns:
        Tuple ``(baseline_reports, progress_reports, trainY, C)`` where
        ``trainY`` is ``prepare_y`` applied to the per-patient labels.
    """
    # preprocess dataset
    df_train = preprocess(df_train)
    # second step of data processing
    # NOTE(review): make_partial_groupings is not defined in the visible cells
    # of this notebook (only make_groupings is) - confirm which one is intended.
    (baseline_reports, progress_reports, predictions) = make_partial_groupings(df_train)
    # find trainY - encode the labels once and reuse the result
    trainY = prepare_y(predictions)
    return (baseline_reports, progress_reports, trainY, C)

In [10]:
# Normalise the labels and add the "clean_report_text" column (see preprocess).
df_train = preprocess(df_train)

In [11]:
# Split the cleaned reports into per-patient baseline / non-baseline frames and
# collect the per-patient response labels (see make_groupings).
(baseline_reports, progress_reports, predictions) = make_groupings(df_train)

In [12]:
# Number of single-space-separated tokens in each grouped baseline report,
# followed by the size of the longest report.
baseline_reports['word_size'] = [
    len(report.split(' ')) for report in baseline_reports['clean_report_text']
]
baseline_reports['word_size'].max()

2057

In [13]:
# Number of single-space-separated tokens in each grouped non-baseline report,
# followed by the size of the longest report.
progress_reports['word_size'] = [
    len(report.split(' ')) for report in progress_reports['clean_report_text']
]
progress_reports['word_size'].max()

26232

In [14]:
# Turn every baseline report into a list of embedding vectors, one per
# single-space-separated token (unknown tokens map to zeros via get_embeddings).
baseline_reports['text'] = baseline_reports['clean_report_text'].map(
    lambda report: list(map(get_embeddings, report.split(' ')))
)

In [15]:
# Turn every non-baseline report into a list of embedding vectors, one per
# single-space-separated token (unknown tokens map to zeros via get_embeddings).
progress_reports['text'] = progress_reports['clean_report_text'].map(
    lambda report: list(map(get_embeddings, report.split(' ')))
)

In [29]:
# Pad / truncate every baseline report to exactly 2000 token vectors. Padding
# and truncation both happen at the front ('pre'), so the last 2000 tokens of
# a long report are kept. The pad value is the vector returned for the token
# 'tseos'.
# dtype='float32' yields a compact numeric tensor; dtype='object' would box
# every element and is meant for variable-length strings.
x = baseline_reports['text'].tolist()
baseline = pad_sequences(x, maxlen=2000, dtype='float32', padding='pre', truncating='pre', value=get_embeddings('tseos'))

In [30]:
# Pad / truncate every non-baseline report to exactly 2000 token vectors.
# Padding and truncation both happen at the front ('pre'), so the last 2000
# tokens of a long report are kept. The pad value is the vector returned for
# the token 'tseos'.
# dtype='float32' yields a compact numeric tensor; dtype='object' would box
# every element and is meant for variable-length strings.
x = progress_reports['text'].tolist()
progress = pad_sequences(x, maxlen=2000, dtype='float32', padding='pre', truncating='pre', value=get_embeddings('tseos'))

In [31]:
def create_model(timesteps=4000, embedding_dim=300, n_classes=4, lstm_units=300):
    """Build and compile the LSTM classifier.

    The defaults reproduce the previously hard-coded architecture, so calling
    ``create_model()`` with no arguments behaves as before.

    Args:
        timesteps: Sequence length of each input sample.
        embedding_dim: Size of the vector at each timestep.
        n_classes: Number of output classes (width of the softmax layer).
        lstm_units: Number of units in the LSTM layer.

    Returns:
        A compiled Keras ``Sequential`` model using categorical cross-entropy
        loss, the Adam optimizer and accuracy as the reported metric.
    """
    model = Sequential()
    model.add(LSTM(units=lstm_units, input_shape=(timesteps, embedding_dim)))  # for everything first
    model.add(Dense(n_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [32]:
# Instantiate the compiled LSTM classifier defined by create_model.
model = create_model()

In [33]:
# Join the padded baseline and non-baseline sequences along axis 1, so each
# sample is the baseline block followed by the non-baseline block.
trainX = np.concatenate([baseline, progress], axis=1)

In [34]:
# Start from the per-patient response labels returned by make_groupings.
trainY = predictions

In [35]:
# Map the string labels to integer class ids, shaped as a single column
# because OneHotEncoder expects 2-D input.
label_enc = LabelEncoder()
label_ids = label_enc.fit_transform(trainY).reshape((-1, 1))
# One-hot encode the class ids. OneHotEncoder returns a scipy sparse matrix by
# default, so convert it to a dense ndarray before handing it to Keras.
onehot_enc = OneHotEncoder()
trainY = onehot_enc.fit_transform(label_ids).toarray()

In [36]:
# Sanity check: one row per sample and one column per one-hot encoded class.
trainY.shape

(361, 4)

In [None]:
# Train for 10 epochs in batches of 40, with 15% of the samples held out for
# validation (validation_split=0.15).
model.fit(trainX, trainY, epochs=10, validation_split=0.15, batch_size=40)

Train on 306 samples, validate on 55 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 40/306 [==>...........................] - ETA: 27:23 - loss: 0.8210 - acc: 0.7250