### The idea behind this is to split the reports and then concatenate the feature vectors

##### Possible splitting points:
###### - Baseline vs Ontx
###### - Middle

In [1]:
import re
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# reading and preprocessing the data
df_train = pd.read_csv('urop_dataset_training.csv')
df_validation = pd.read_csv('urop_dataset_validation.csv')

def make_POD(curr):
    if curr == "POD/brain" or curr == "SD":
        return "POD"
    if curr == "CR":
        return "PR"
    return curr

def preprocess(df_train):
    # this field is the class we're trying to predict and hence we have to strip any whitespaces from it
    df_train["Objective Response per RECIST v1.1"] = df_train["Objective Response per RECIST v1.1"].apply(lambda x: make_POD(x.strip()))
    # cleaning scan report text - keep only words and numbers with spaces between them
    df_train["clean_report_text"] = df_train["Scan report text"].apply(lambda text: re.sub('\W+', ' ', text).lower().strip() + str(' '))
    return df_train

In [3]:
def make_groupings(df_train):
    # group the reports by patient and baseline
    column_patient = 'Patient ID'
    column_baseline = 'Scan timepoint (baseline = prior to treatment start, ontx = during treatment or prior to progression if stopped treatment , progression = time of RECIST defined progression)'
    df_train['is_baseline'] = (df_train[column_baseline] == 'baseline')
    groupped_df = df_train.groupby([column_patient, 'is_baseline'])['clean_report_text'].apply(lambda x: x.sum())
    predictions = df_train.groupby(['Patient ID'])["Objective Response per RECIST v1.1"].first()

    # fill missing reports with nothing
    for i, v in groupped_df.iteritems():
        patient, baseline = i
        if (patient, not baseline) not in groupped_df:
            groupped_df[(patient, not baseline)] = 'insert random word'

    # now create the different dataframes
    groupped_df = groupped_df.to_frame().reset_index()
    baseline_reports = groupped_df[groupped_df['is_baseline'] == True]
    progress_reports = groupped_df[groupped_df['is_baseline'] == False]
    return (baseline_reports, progress_reports, predictions)

In [5]:
# learn the bag of words representation for both types of df
def learn_bow(reports, min_df=1, ngram_range=(1, 3), max_features=5000):
    stopwords = ["""'mm', 'dd', '2017', '2016', '2015', '2014', '2013', '2012', 'date', 'md'"""]
    countVec = CountVectorizer(min_df = min_df, \
                               ngram_range = ngram_range, \
                               max_features = max_features, \
                               stop_words = stopwords)
    countVec.fit(reports)
    return countVec.transform(reports)

# baseline_bow = np.array(learn_bow(baseline_reports['clean_report_text'], max_features=10000).todense())
#progress_bow = np.array(learn_bow(progress_reports['clean_report_text'], max_features=10000).todense())
# overall_bow = np.hstack([baseline_bow, progress_bow])

In [6]:
def prepare_y(data_y):
    label_enc = LabelEncoder()
    label_enc_y = label_enc.fit(data_y.values)
    return label_enc_y.transform(data_y.values.reshape(-1, 1))

In [7]:
def train_model(trainX, trainY, C=0.1):
    lr = LogisticRegression(C=C, verbose=True, class_weight='balanced')
    scores = cross_val_score(lr, trainX, trainY, cv=5)
    print(scores)
    print(scores.mean())

In [8]:
train_model(overall_bow, prepare_y(predictions), 0.01)

NameError: name 'overall_bow' is not defined

In [11]:
def try_model(df_train, baseline_features=5000, progress_features=5000, C=0.1):
    # preprocess dataset
    df_train = preprocess(df_train)
    # second step of data processing
    (baseline_reports, progress_reports, predictions) = make_groupings(df_train)
    # find trainX data
    baseline_bow = np.array(learn_bow(baseline_reports['clean_report_text'], max_features=baseline_features).todense())
    progress_bow = np.array(learn_bow(progress_reports['clean_report_text'], max_features=progress_features).todense())
    trainX = np.hstack([baseline_bow, progress_bow])
    # find trainY
    trainY = prepare_y(predictions)
    return (trainX, prepare_y(predictions), C)

In [14]:
for _C in [0.03, 0.01, 0.005, 0.002, 0.001]:
    trainX, trainY, C = try_model(df_train, C = _C)
    train_model(trainX, trainY, C = C)

  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.86111111 0.88732394]
0.8291390657491371


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.875      0.88732394]
0.831916843526915


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.78082192 0.78082192 0.83561644 0.88888889 0.88732394]
0.8346946213046926


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.82191781 0.78082192 0.84931507 0.875      0.87323944]
0.8400588462280533


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.82191781 0.79452055 0.84931507 0.84722222 0.85915493]
0.8344261152914442


In [13]:
for baseline_features in [2000, 4000, 8000]:
    for progress_features in [2000, 4000, 8000]:
        trainX, trainY, C = try_model(df_train, C = 0.01)
        train_model(trainX, trainY, C = C)

  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.875      0.88732394]
0.831916843526915


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.875      0.88732394]
0.831916843526915


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.875      0.88732394]
0.831916843526915


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.875      0.88732394]
0.831916843526915


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.875      0.88732394]
0.831916843526915


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.875      0.88732394]
0.831916843526915


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.875      0.88732394]
0.831916843526915


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.875      0.88732394]
0.831916843526915


  y = column_or_1d(y, warn=True)


[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.79452055 0.76712329 0.83561644 0.875      0.88732394]
0.831916843526915
