In [1]:
import re
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the training and validation splits; both paths are relative to the
# notebook's working directory.
df_train = pd.read_csv(filepath_or_buffer='urop_dataset_training.csv')
df_validation = pd.read_csv(filepath_or_buffer='urop_dataset_validation.csv')

## Cleaning the report data

### Idea 1
##### 1. Breaking the paragraphs into different reports
##### 2. Giving every paragraph the same class and training on them with a bag-of-words (BoW) classifier using n-grams
##### 3. Prediction is made using an average?

### Idea 2
##### 1. Use the paragraphs as a whole for your training
##### 2. Probably too long

### Idea 3
##### 1. Concatenate all the reports from the different visits
##### 2. Even longer text

In [3]:
# This column is the class we are trying to predict. Strip surrounding whitespace
# so that labels differing only by stray spaces are not treated as separate classes.
# The vectorised .str accessor is used instead of a per-element lambda: it leaves
# missing values as NaN rather than raising AttributeError on a float.
df_train["Objective Response per RECIST v1.1"] = df_train["Objective Response per RECIST v1.1"].str.strip()

In [4]:
# Per-patient views of the training data, sharing a single groupby.
by_patient = df_train.groupby('Patient ID')
# Summing a group of strings concatenates them directly (no separator is inserted),
# giving one combined report string per patient.
temp_df = by_patient['Scan report text'].apply(lambda reports: reports.sum())
# Still a lazy SeriesGroupBy: the labels are grouped here but not aggregated.
temp_labels = by_patient['Objective Response per RECIST v1.1']

In [5]:
# Clean the scan report text: collapse every run of non-word characters into a
# single space, lower-case the result and trim leading/trailing spaces.
# The pattern is a raw string so that `\W` reaches the regex engine without relying
# on Python tolerating an invalid escape sequence in a normal string literal.
df_train["clean_report_text"] = df_train["Scan report text"].apply(
    lambda text: re.sub(r'\W+', ' ', text).lower().strip()
)

In [6]:
def onehot_encode_column(df, column_name):
    """Replace one categorical column with one indicator column per distinct value.

    Args:
        df: DataFrame that contains ``column_name``.
        column_name: Label of the column to expand.

    Returns:
        A new DataFrame made of ``df`` without ``column_name``, followed by the
        indicator columns produced by ``pd.get_dummies``. The indicator columns
        are named after the category values themselves (no prefix is added) and
        are joined on the index. ``df`` itself is left untouched.
    """
    indicators = pd.get_dummies(df[column_name])
    return df.drop(column_name, axis="columns").join(indicators)

def days_after_start(row):
    """Return the whole number of days from treatment start to the scan.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            "Treatment start date" and "Date of scan", both strings in
            month/day/four-digit-year form.

    Returns:
        ``(scan date - start date).days`` as an int; negative when the scan
        precedes the treatment start.

    Raises:
        ValueError: if either date string does not match the expected format.
    """
    date_format = '%m/%d/%Y'
    scanned_on = datetime.strptime(row["Date of scan"], date_format)
    started_on = datetime.strptime(row["Treatment start date"], date_format)
    elapsed = scanned_on - started_on
    return elapsed.days

def clean_dataframe(df):
    """Build the numeric, non-text feature table from a scan-report frame.

    The steps, in order:
      1. one-hot encode "Treatment setting", "Scan included on RECIST form? (y/n)"
         and "Type of scan";
      2. add ``date_dist``: days between treatment start and scan, standardised
         with the mean and standard deviation of this same frame;
      3. turn the scan-timepoint column into a 0/1 flag (1 only for "baseline");
      4. drop the identifier, date, PFS, target and text columns.

    Args:
        df: DataFrame holding all of the columns named above, including
            "clean_report_text" (so the text-cleaning step must already have run).

    Returns:
        A new DataFrame with the transformed columns; the frame passed in is
        not modified because the first encoding step already returns a copy.
    """
    # Categorical columns become indicator columns, processed in this order.
    for categorical in ("Treatment setting",
                        "Scan included on RECIST form? (y/n)",
                        "Type of scan"):
        df = onehot_encode_column(df, categorical)

    # Day gap between treatment start and scan, then z-scored with this frame's own statistics.
    df["date_dist"] = df[["Treatment start date", "Date of scan"]].apply(days_after_start, axis=1)
    elapsed_days = df["date_dist"]
    df["date_dist"] = (elapsed_days - elapsed_days.mean()) / elapsed_days.std()

    # Scan timepoint collapses to a flag: 1 for "baseline", 0 for every other value.
    scan_timepoint = "Scan timepoint (baseline = prior to treatment start, ontx = during treatment or prior to progression if stopped treatment , progression = time of RECIST defined progression)"
    is_baseline = df[scan_timepoint] == "baseline"
    df[scan_timepoint] = is_baseline * 1

    # Columns that are not used as tabular features.
    unused_columns = [
        "Patient ID",
        "PFS censor                              (1 = progressed, 0 = has not progressed)",
        "Treatment start date",
        "Date of scan",
        "Date of radiologic progression-free survival (PFS, calculated from start date)",
        "Scan type specified",
        "Objective Response per RECIST v1.1",
        "Scan report text",
        "clean_report_text",
    ]
    return df.drop(unused_columns, axis=1)

In [7]:
# Tabular (non-text) features derived from df_train: categorical columns one-hot
# encoded, the start-to-scan day gap standardised, and the ID/date/target/text
# columns dropped. Requires "clean_report_text" to exist on df_train already,
# because clean_dataframe drops that column.
text_train = clean_dataframe(df_train)

In [8]:
# Text-model inputs: cleaned report text as the feature, RECIST response as the target.
text_train_X = df_train["clean_report_text"]
text_train_y = df_train["Objective Response per RECIST v1.1"]

# Bag-of-words settings: minimum document frequency, n-gram sizes (1 to 3 words)
# and the cap on vocabulary size.
min_df, ngram_range, max_features = 2, (1, 3), 50000

# Encoders created up front; `enc` is not used in the cells visible here.
label_enc = LabelEncoder()
enc = OneHotEncoder()

countVec = CountVectorizer(
    min_df=min_df,
    ngram_range=ngram_range,
    max_features=max_features,
)
# Learn the vocabulary from the training reports and build their sparse
# bag-of-words count matrix in a single call.
trainX = countVec.fit_transform(text_train_X)

In [9]:
# Build the combined matrix from the fitted vectoriser rather than from `trainX`
# itself: this cell rebinds `trainX` to a dense ndarray (which has no .toarray()),
# so reading `trainX` here would make the cell fail when it is run a second time.
bow_counts = countVec.transform(text_train_X)
# A sparse matrix reports the same shape as its dense form, so there is no need
# to densify just to print it.
print(bow_counts.shape)
# Densify once and append the tabular columns to the right of the n-gram counts.
trainX = np.hstack([bow_counts.toarray(), text_train.values])
print(trainX)

(2250, 50000)
[[ 0.          0.          0.         ...  0.          0.
  -0.7655724 ]
 [ 0.          0.          0.         ...  0.          0.
  -0.46540062]
 [ 0.          0.          0.         ...  1.          0.
  -0.76134463]
 ...
 [ 0.          0.          0.         ...  0.          0.
  -0.34702301]
 [ 0.          0.          0.         ...  0.          0.
  -0.72752246]
 [ 0.          0.          0.         ...  0.          0.
  -0.46962839]]


In [10]:
# Map the string response classes to integer ids.
# LabelEncoder works on 1-D arrays; handing it a column vector (reshape(-1, 1))
# makes scikit-learn flatten it and emit a DataConversionWarning, so the labels
# are passed as the plain 1-D array for both fit and transform.
label_enc_y = label_enc.fit(text_train_y.values)
trainY = label_enc_y.transform(text_train_y.values)

  y = column_or_1d(y, warn=True)


In [11]:
# Logistic regression with balanced class weights and C left unset, 5-fold CV.
# NOTE(review): df_train is grouped by 'Patient ID' elsewhere in this notebook, so
# one patient may contribute several rows; plain cv=5 does not keep those rows in
# the same fold. Consider a group-aware splitter - confirm against the data.
lr = LogisticRegression(class_weight='balanced', verbose=True)
scores = cross_val_score(estimator=lr, X=trainX, y=trainY, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.53097345 0.46784922 0.49777778 0.49888641 0.47321429]
0.4937402306040359


In [12]:
# Logistic regression with no class weighting and C left unset, 5-fold CV.
lr = LogisticRegression(verbose=True)
scores = cross_val_score(estimator=lr, X=trainX, y=trainY, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.5420354  0.46563193 0.49777778 0.49665924 0.47321429]
0.49506372670608156


In [13]:
# Logistic regression, C=0.5 with balanced class weights, 5-fold CV.
lr = LogisticRegression(C=0.5, class_weight='balanced', verbose=True)
scores = cross_val_score(estimator=lr, X=trainX, y=trainY, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.52876106 0.47006652 0.48888889 0.51224944 0.47544643]
0.49508246829227076


In [14]:
# Logistic regression, C=0.5 without class weighting, 5-fold CV.
lr = LogisticRegression(C=0.5, verbose=True)
scores = cross_val_score(estimator=lr, X=trainX, y=trainY, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.53982301 0.46563193 0.49777778 0.50111359 0.47991071]
0.49685140314114307


In [15]:
# Multinomial logistic regression fitted with the newton-cg solver,
# C=0.1 and balanced class weights, 5-fold CV.
lr = LogisticRegression(
    C=0.1,
    class_weight='balanced',
    solver='newton-cg',
    multi_class='multinomial',
    verbose=True,
)
scores = cross_val_score(estimator=lr, X=trainX, y=trainY, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.9min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.1min finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.1min finished


[0.5199115  0.47006652 0.5        0.51670379 0.46651786]
0.49463993332123585


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.8min finished


In [16]:
# Logistic regression, C=0.1 with balanced class weights, 5-fold CV.
lr = LogisticRegression(C=0.1, class_weight='balanced', verbose=True)
scores = cross_val_score(estimator=lr, X=trainX, y=trainY, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.52433628 0.47006652 0.49777778 0.52115813 0.47767857]
0.4982034560830287


In [17]:
# Logistic regression, C=0.1 without class weighting, 5-fold CV.
lr = LogisticRegression(C=0.1, verbose=True)
scores = cross_val_score(estimator=lr, X=trainX, y=trainY, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.52876106 0.46563193 0.49333333 0.51002227 0.47544643]
0.49463900492262997


In [18]:
# Logistic regression, C=0.05 with balanced class weights, 5-fold CV.
lr = LogisticRegression(C=0.05, class_weight='balanced', verbose=True)
scores = cross_val_score(estimator=lr, X=trainX, y=trainY, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.52876106 0.47006652 0.49555556 0.52783964 0.47544643]
0.49953384171469084


In [19]:
# Logistic regression, C=0.005 with balanced class weights, 5-fold CV.
lr = LogisticRegression(C=0.005, class_weight='balanced', verbose=True)
scores = cross_val_score(estimator=lr, X=trainX, y=trainY, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.49557522 0.4789357  0.52222222 0.5233853  0.48214286]
0.5004522599440124


In [20]:
# Logistic regression, C=0.008 without class weighting, 5-fold CV.
lr = LogisticRegression(C=0.008, verbose=True)
scores = cross_val_score(estimator=lr, X=trainX, y=trainY, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(scores, scores.mean(), sep='\n')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.50663717 0.49002217 0.50666667 0.51002227 0.47321429]
0.49731251303729385


In [21]:
# Gradient boosting with 1000 estimators on the same feature matrix, 5-fold CV.
# NOTE(review): the recorded run of this cell ends in KeyboardInterrupt -
# presumably stopped by hand because of its runtime; confirm before relying on it.
clf = GradientBoostingClassifier(n_estimators=1000)
scores = cross_val_score(estimator=clf, X=trainX, y=trainY, cv=5)
print(scores)

KeyboardInterrupt: 