### The idea behind this is to split the reports and then concatenate the feature vectors

##### Possible splitting points:
###### - Baseline vs Ontx
###### - Middle

In [1]:
import re
import sys
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# reading and preprocessing the data
df_train = pd.read_csv('urop_dataset_training.csv')
df_validation = pd.read_csv('urop_dataset_validation.csv')

def make_POD(curr):
    if curr == "POD/brain":
        return "POD"
    return curr

def preprocess(df_train):
    # this field is the class we're trying to predict and hence we have to strip any whitespaces from it
    df_train["Objective Response per RECIST v1.1"] = df_train["Objective Response per RECIST v1.1"].apply(lambda x: make_POD(x.strip()))
    # cleaning scan report text - keep only words and numbers with spaces between them
    df_train["clean_report_text"] = df_train["Scan report text"].apply(lambda text: re.sub('\W+', ' ', text).lower().strip() + str(' '))
    return df_train

In [13]:
import warnings
warnings.filterwarnings("ignore")

### Initialize GloVe Embeddings

In [None]:
EMBEDDING_FILE = 'glove.42B.300d.txt'
embeddings_index = {}
f = open(EMBEDDING_FILE, encoding='utf-8')
count = 0
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
    count += 1
    if count%50000 == 0:
        print(count)
f.close()

In [None]:
def get_embeddings(word):
    if word in embeddings_index:
        return embeddings_index[word]
    return np.zeros((300,))

### We have the choice to skip the entries that don't have the Use resist form because they are irrelevant

#### This has increased accuracy of the baseline-split to 73% and for the normal one to 63.5%

In [3]:
df_train = df_train[df_train["Scan included on RECIST form? (y/n)"] == "yes"]

### Split for Baseline vs Ontx

##### Best Accuracy: 70%

In [4]:
def make_groupings(df_train):
    # group the reports by patient and baseline
    column_patient = 'Patient ID'
    column_baseline = 'Scan timepoint (baseline = prior to treatment start, ontx = during treatment or prior to progression if stopped treatment , progression = time of RECIST defined progression)'
    df_train['is_baseline'] = (df_train[column_baseline] == 'baseline')
    groupped_df = df_train.groupby([column_patient, 'is_baseline'])['clean_report_text'].apply(lambda x: x.sum())
    predictions = df_train.groupby(['Patient ID'])["Objective Response per RECIST v1.1"].first()

    # fill missing reports with nothing
    for i, v in groupped_df.iteritems():
        patient, baseline = i
        if (patient, not baseline) not in groupped_df:
            groupped_df[(patient, not baseline)] = 'insert random word'

    # now create the different dataframes
    groupped_df = groupped_df.to_frame().reset_index()
    baseline_reports = groupped_df[groupped_df['is_baseline'] == True]
    progress_reports = groupped_df[groupped_df['is_baseline'] == False]
    return (baseline_reports, progress_reports, predictions)

In [5]:
# learn the bag of words representation for both types of df
def learn_bow(reports, min_df=1, ngram_range=(1, 3), max_features=5000):
    stopwords = ['mm', 'dd', '2017', '2016', '2015', '2014', '2013', '2012', 'date', 'md']
    countVec = CountVectorizer(min_df = min_df, \
                               ngram_range = ngram_range, \
                               max_features = max_features, \
                               stop_words = stopwords)
    countVec.fit(reports)
    return countVec.transform(reports)

In [6]:
def prepare_y(data_y):
    label_enc = LabelEncoder()
    label_enc_y = label_enc.fit(data_y.values)
    return label_enc_y.transform(data_y.values.reshape(-1, 1))

In [7]:
def train_model_lr(trainX, trainY, C=0.1):
    lr = LogisticRegression(C=C, verbose=True, class_weight='balanced')
    scores = cross_val_score(lr, trainX, trainY, cv=5)
    print(scores)
    print(scores.mean())

In [8]:
def train_model_mlp(trainX, trainY, hidden_layer_sizes=(5000, 100)):
    mlp = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes)
    scores = cross_val_score(mlp, trainX, trainY, cv=5)
    print(scores)
    print(scores.mean())

In [14]:
def try_model(df_train, baseline_features=5000, progress_features=5000, C=0.1):
    # preprocess dataset
    df_train = preprocess(df_train)
    # second step of data processing
    (baseline_reports, progress_reports, predictions) = make_groupings(df_train)
    # find trainX data
    baseline_bow = np.array(learn_bow(baseline_reports['clean_report_text'], max_features=baseline_features).todense())
    progress_bow = np.array(learn_bow(progress_reports['clean_report_text'], max_features=progress_features).todense())
    trainX = np.hstack([baseline_bow, progress_bow])
    # find trainY
    trainY = prepare_y(predictions)
    return (trainX, prepare_y(predictions), C)

In [10]:
def make_partial_groupings(df_train):
    # group the reports by patient and baseline
    column_patient = 'Patient ID'
    column_baseline = 'Scan timepoint (baseline = prior to treatment start, ontx = during treatment or prior to progression if stopped treatment , progression = time of RECIST defined progression)'
    df_train['is_baseline'] = (df_train[column_baseline] == 'baseline')
    groupped_df = df_train.groupby([column_patient, 'is_baseline'])['clean_report_text'].apply(lambda x: x.sum())
    predictions = df_train.groupby(['Patient ID'])["Objective Response per RECIST v1.1"].first()

    # fill missing reports with nothing
    for i, v in groupped_df.iteritems():
        patient, baseline = i
        if (patient, not baseline) not in groupped_df:
            groupped_df[(patient, not baseline)] = 'insert random word'

    # now create the different dataframes
    groupped_df = groupped_df.to_frame().reset_index()
    baseline_reports = groupped_df[groupped_df['is_baseline'] == True]
    progress_reports = groupped_df[groupped_df['is_baseline'] == False]
    return (baseline_reports, df_train[df_train['is_baseline'] == False], predictions)

In [11]:
def try_model3(df_train, baseline_features=1000, progress_features=1000, C=0.1):
    # preprocess dataset
    df_train = preprocess(df_train)
    # second step of data processing
    (baseline_reports, progress_reports, predictions) = make_partial_groupings(df_train)
    (progress_reports1, progress_reports2, _) = make_groupings_split(progress_reports)
    # find trainX data
    baseline_bow = np.array(learn_bow(baseline_reports['clean_report_text'], max_features=baseline_features).todense())
    progress_bow1 = np.array(learn_bow(progress_reports1['clean_report_text_x'], max_features=progress_features).todense())
    progress_bow2 = np.array(learn_bow(progress_reports2['clean_report_text_x'], max_features=progress_features).todense())
    trainX = np.hstack([baseline_bow, progress_bow1, progress_bow2])
    # find trainY
    trainY = prepare_y(predictions)
    return (trainX, prepare_y(predictions), C)

In [15]:
for _C in [0.1, 0.07, 0.05, 0.03, 0.01, 0.005]:
    trainX, trainY, C = try_model(df_train, C = _C)
    train_model_lr(trainX, trainY, C = C)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.71232877 0.73972603 0.60273973 0.73611111 0.7       ]
0.6981811263318113
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.71232877 0.73972603 0.60273973 0.73611111 0.7       ]
0.6981811263318113
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.71232877 0.73972603 0.63013699 0.73611111 0.7       ]
0.7036605783866058
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.71232877 0.73972603 0.63013699 0.73611111 0.7       ]
0.7036605783866058
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.69863014 0.75342466 0.65753425 0.75       0.7       ]
0.711917808219178
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.69863014 0.73972603 0.67123288 0.76388889 0.71428571]
0.7175527288540987


In [16]:
for baseline_features in [800, 1600]:
    for progress_features in [2000, 4000]:
        for _C in [0.007, 0.005, 0.003, 0.002]:
            print('baseline: ' + str(baseline_features) + ' || progress: ' + str(progress_features) + ' || C: ' + str(_C))
            trainX, trainY, C = try_model(df_train, baseline_features=baseline_features, progress_features=progress_features, C = _C)
            train_model_lr(trainX, trainY, C = C)

baseline: 800 || progress: 2000 || C: 0.007
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.71232877 0.75342466 0.67123288 0.76388889 0.72857143]
0.7258893237660361
baseline: 800 || progress: 2000 || C: 0.005
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.71232877 0.75342466 0.67123288 0.75       0.72857143]
0.7231115459882583
baseline: 800 || progress: 2000 || C: 0.003
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.71232877 0.7260274  0.67123288 0.75       0.74285714]
0.7204892367906066
baseline: 800 || progress: 2000 || C: 0.002
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.71232877 0.7260274  0.68493151 0.73611111 0.74285714]
0.7204511850402262
baseline: 800 || progress: 4000 || C: 0.007
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.69863014 0.73972603 0.67123288 0.75       0.74285714]
0.7204892367906066
baseline: 800 || progress: 4000 || C: 0.005
[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.69863014 0.73972603 0.6

In [9]:
for baseline_features in [500, 1000]:
    for progress_features in [2000, 4000]:
        print('baseline: ' + str(baseline_features) + ' || progress: ' + str(progress_features))
        trainX, trainY, C = try_model(df_train, baseline_features=baseline_features, progress_features=progress_features)
        train_model_mlp(trainX, trainY, \
                        hidden_layer_sizes=((baseline_features+progress_features)//7, (baseline_features+progress_features)//100))

baseline: 500 || progress: 2000


  y = column_or_1d(y, warn=True)


[0.60273973 0.64383562 0.65753425 0.69444444 0.64788732]
0.6492882714858406
baseline: 500 || progress: 4000


  y = column_or_1d(y, warn=True)


[0.69863014 0.71232877 0.63013699 0.68055556 0.69014085]
0.6823584582073873
baseline: 1000 || progress: 2000


  y = column_or_1d(y, warn=True)


[0.60273973 0.65753425 0.65753425 0.70833333 0.67605634]
0.6604395781079169
baseline: 1000 || progress: 4000


  y = column_or_1d(y, warn=True)


[0.60273973 0.65753425 0.60273973 0.72222222 0.70422535]
0.657892254593007


### Split around the middle

##### Best Accuracy: 59%

In [10]:
# reading and preprocessing the data
df_train = pd.read_csv('urop_dataset_training.csv')
df_validation = pd.read_csv('urop_dataset_validation.csv')

In [54]:
def make_groupings_split(df_train):
    # do some magic here
    df_train['same_patient'] = df_train['Patient ID'].eq(df_train['Patient ID'].shift())
    df_train['report number'] = df_train['same_patient'].cumsum()
    df_train['same_patient'] = df_train['same_patient'].apply(lambda x: np.nan if x == 0 else  x)
    v = df_train['same_patient']
    cumsum = v.cumsum().fillna(method='pad')
    reset = -cumsum[v.isnull()].diff().fillna(cumsum)
    df_train['same_patient'] = v.where(v.notnull(), reset).cumsum()
    df_train['same_patient'].values[0] = 0
    df_groupped = df_train.groupby(['Patient ID'])["clean_report_text"].count()
    df_groupped = df_groupped.to_frame().reset_index()
    result = pd.merge(df_train, df_groupped, how='left', on=['Patient ID'])
    # end of magic
    
    # baseline has the half first of the reports
    # and progress has the other half
    column_patient = 'Patient ID'
    result['splitting_category'] = result['same_patient'] < result['clean_report_text_y']//2
    groupped_df = result.groupby([column_patient, 'splitting_category'])['clean_report_text_x'].apply(lambda x: x.sum())
    predictions = result.groupby(['Patient ID'])["Objective Response per RECIST v1.1"].first()
    
    # fill missing reports with nothing
    for i, v in groupped_df.iteritems():
        patient, category = i
        if (patient, not category) not in groupped_df:
            groupped_df[(patient, not category)] = 'insert random word'
    
    # now create the different dataframes
    groupped_df = groupped_df.to_frame().reset_index()
    baseline_reports = groupped_df[groupped_df['splitting_category'] == True]
    progress_reports = groupped_df[groupped_df['splitting_category'] == False]
    return (baseline_reports, progress_reports, predictions)

In [30]:
def try_model_split(df_train, baseline_features=5000, progress_features=5000, C=0.1):
    # preprocess dataset
    df_train = preprocess(df_train)
    # second step of data processing
    (baseline_reports, progress_reports, predictions) = make_groupings_split(df_train)
    
    # find trainX data
    baseline_bow = np.array(learn_bow(baseline_reports['clean_report_text'], max_features=baseline_features).todense())
    progress_bow = np.array(learn_bow(progress_reports['clean_report_text'], max_features=progress_features).todense())
    trainX = np.hstack([baseline_bow, progress_bow])
    # find trainY
    trainY = prepare_y(predictions)
    return (trainX, prepare_y(predictions), C)

In [40]:
for _C in [0.03, 0.01, 0.007, 0.005, 0.003]:
    trainX, trainY, C = try_model_split4(df_train, C = _C)
    train_model_lr(trainX, trainY, C = C)

  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.43835616 0.53424658 0.50684932 0.52777778 0.58571429]
0.5185888236573168


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.45205479 0.52054795 0.53424658 0.52777778 0.57142857]
0.5212111328549683


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.46575342 0.50684932 0.57534247 0.52777778 0.55714286]
0.5265731680800174


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.47945205 0.49315068 0.57534247 0.52777778 0.55714286]
0.5265731680800174


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.49315068 0.49315068 0.56164384 0.51388889 0.51428571]
0.515223961730811


In [15]:
for baseline_features in [800, 2000]:
    for progress_features in [3000, 5000]:
        for _C in [0.01, 0.007, 0.005, 0.003]:
            print('baseline: ' + str(baseline_features) + ' || progress: ' + str(progress_features) + ' || C: ' + str(_C))
            trainX, trainY, C = try_model_split(df_train, baseline_features=baseline_features, progress_features=progress_features, C = _C)
            train_model_lr(trainX, trainY, C = C)

baseline: 800 || progress: 3000 || C: 0.01


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.57534247 0.65753425 0.64383562 0.58333333 0.68571429]
0.6291519895629485
baseline: 800 || progress: 3000 || C: 0.007


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.54794521 0.67123288 0.64383562 0.58333333 0.7       ]
0.6292694063926941
baseline: 800 || progress: 3000 || C: 0.005


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.56164384 0.68493151 0.64383562 0.59722222 0.7       ]
0.6375266362252663
baseline: 800 || progress: 3000 || C: 0.003


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.56164384 0.67123288 0.64383562 0.58333333 0.7       ]
0.6320091324200913
baseline: 800 || progress: 5000 || C: 0.01


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.56164384 0.65753425 0.64383562 0.59722222 0.68571429]
0.6291900413133289
baseline: 800 || progress: 5000 || C: 0.007


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.56164384 0.68493151 0.64383562 0.59722222 0.7       ]
0.6375266362252663
baseline: 800 || progress: 5000 || C: 0.005


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.56164384 0.67123288 0.67123288 0.58333333 0.71428571]
0.6403457273320288
baseline: 800 || progress: 5000 || C: 0.003


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.56164384 0.67123288 0.67123288 0.58333333 0.68571429]
0.634631441617743
baseline: 2000 || progress: 3000 || C: 0.01


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.57534247 0.64383562 0.63013699 0.55555556 0.67142857]
0.6152598390954555
baseline: 2000 || progress: 3000 || C: 0.007


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.5890411  0.65753425 0.63013699 0.56944444 0.67142857]
0.6235170689280278
baseline: 2000 || progress: 3000 || C: 0.005


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.54794521 0.65753425 0.63013699 0.58333333 0.68571429]
0.6209328114807567
baseline: 2000 || progress: 3000 || C: 0.003


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.56164384 0.67123288 0.67123288 0.56944444 0.68571429]
0.6318536638399653
baseline: 2000 || progress: 5000 || C: 0.01


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.57534247 0.65753425 0.64383562 0.55555556 0.67142857]
0.62073929115025
baseline: 2000 || progress: 5000 || C: 0.007


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.56164384 0.67123288 0.64383562 0.56944444 0.67142857]
0.6235170689280278
baseline: 2000 || progress: 5000 || C: 0.005


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.56164384 0.65753425 0.64383562 0.58333333 0.68571429]
0.6264122635355512
baseline: 2000 || progress: 5000 || C: 0.003


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.54794521 0.67123288 0.67123288 0.56944444 0.68571429]
0.629113937812568


In [17]:
for baseline_features in [500, 1000]:
    for progress_features in [2000, 4000]:
        print('baseline: ' + str(baseline_features) + ' || progress: ' + str(progress_features))
        trainX, trainY, C = try_model_split(df_train, baseline_features=baseline_features, progress_features=progress_features)
        train_model_mlp(trainX, trainY, \
                        hidden_layer_sizes=((baseline_features+progress_features)//7, (baseline_features+progress_features)//100))

baseline: 500 || progress: 2000


  y = column_or_1d(y, warn=True)


[0.54794521 0.60273973 0.57534247 0.63888889 0.65714286]
0.6044118286584039
baseline: 500 || progress: 4000


  y = column_or_1d(y, warn=True)


[0.56164384 0.60273973 0.5890411  0.56944444 0.67142857]
0.5988595346814525
baseline: 1000 || progress: 2000


  y = column_or_1d(y, warn=True)


[0.57534247 0.57534247 0.64383562 0.625      0.7       ]
0.6239041095890411
baseline: 1000 || progress: 4000


  y = column_or_1d(y, warn=True)


[0.53424658 0.60273973 0.65753425 0.59722222 0.62857143]
0.6040628397477713
