In [1]:
import re
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the training and validation splits; both CSV files are read from the current working directory.
df_train, df_validation = (
    pd.read_csv(csv_name)
    for csv_name in ('urop_dataset_training.csv', 'urop_dataset_validation.csv')
)

## Cleaning the report data

### Idea 1
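##### 1. Break each report into paragraphs and treat every paragraph as a separate report
##### 2. Give every paragraph the class of its source report and train a bag-of-words (BoW) classifier with n-grams on the paragraphs
##### 3. Make the report-level prediction by averaging the paragraph-level predictions?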

### Idea 2
##### 1. Use the paragraphs as a whole for your training
##### 2. Drawback: the text is probably too long

### Idea 3
##### 1. Concatenate all the reports from the different visits
##### 2. Drawback: even longer text than in Idea 2

In [3]:
# This field is the class we are trying to predict, so surrounding whitespace is stripped from every label
# (otherwise "PR" and "PR " would count as different classes).
# The vectorised .str accessor is used instead of apply(lambda x: x.strip()) so that a missing label
# stays NaN rather than raising AttributeError on a float.
df_train["Objective Response per RECIST v1.1"] = df_train["Objective Response per RECIST v1.1"].str.strip()

In [4]:
# Cleaning scan report text: collapse every run of non-word characters into a single space,
# then lower-case and trim the result.
# The pattern is a raw string so that "\W" reaches the regex engine verbatim; in a plain string
# literal "\W" is an invalid escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
df_train["clean_report_text"] = df_train["Scan report text"].apply(
    lambda text: re.sub(r'\W+', ' ', text).lower().strip()
)

In [5]:
def onehot_encode_column(df, column_name):
    """Return a copy of `df` in which `column_name` is replaced by indicator columns.

    One indicator column is produced per distinct value of `column_name`
    (via `pd.get_dummies`), named after that value. The original column is
    dropped and the indicators are joined on the index. `df` itself is not
    modified.
    """
    indicator_cols = pd.get_dummies(df[column_name])
    return df.drop(column_name, axis=1).join(indicator_cols)

def days_after_start(row):
    """Return the whole number of days from treatment start to the scan.

    `row` must provide the keys "Treatment start date" and "Date of scan",
    both as strings in month/day/year form (`%m/%d/%Y`). The result is
    negative when the scan predates the treatment start.
    """
    date_format = '%m/%d/%Y'
    start_text = row["Treatment start date"]
    scan_text = row["Date of scan"]
    scan_day = datetime.strptime(scan_text, date_format)
    start_day = datetime.strptime(start_text, date_format)
    elapsed = scan_day - start_day
    return elapsed.days

def clean_dataframe(df):
    """Turn the raw scan table into a purely numeric feature frame.

    Steps, in order:
      1. One-hot encode "Treatment setting", "Scan included on RECIST form? (y/n)"
         and "Type of scan" (each source column is replaced by its indicator columns).
      2. Add "date_dist": days between "Treatment start date" and "Date of scan".
      3. Recode the scan-timepoint column to 1 for "baseline" and 0 for anything else.
      4. Drop the identifier, date, PFS, label and report-text columns.

    All listed columns (including "clean_report_text") must be present in `df`.
    Returns the new frame.
    """
    # categorical columns -> one-hot indicator columns
    for categorical_col in ("Treatment setting",
                            "Scan included on RECIST form? (y/n)",
                            "Type of scan"):
        df = onehot_encode_column(df, categorical_col)

    # difference of dates, in days
    date_cols = ["Treatment start date", "Date of scan"]
    df["date_dist"] = df[date_cols].apply(days_after_start, axis=1)

    # scan timepoint -> 1 if baseline, 0 otherwise (bool * 1 gives an integer column)
    scan_timepoint = "Scan timepoint (baseline = prior to treatment start, ontx = during treatment or prior to progression if stopped treatment , progression = time of RECIST defined progression)"
    df[scan_timepoint] = (df[scan_timepoint] == "baseline") * 1

    # columns that must not reach the model: identifiers, raw dates, outcome fields and free text
    columns_to_drop = [
        "Patient ID",
        "PFS censor                              (1 = progressed, 0 = has not progressed)",
        "Treatment start date",
        "Date of scan",
        "Date of radiologic progression-free survival (PFS, calculated from start date)",
        "Scan type specified",
        "Objective Response per RECIST v1.1",
        "Scan report text",
        "clean_report_text",
    ]
    return df.drop(columns_to_drop, axis=1)

In [6]:
# Build the non-text feature frame. clean_dataframe drops the label and both report-text columns,
# so despite its name `text_train` holds only the encoded tabular features.
text_train = clean_dataframe(df_train)

In [7]:
# Documents (cleaned report text) and target (RECIST objective response) for the text model
text_train_X = df_train["clean_report_text"]
text_train_y = df_train["Objective Response per RECIST v1.1"]

# Bag-of-words hyper-parameters
min_df = 2
ngram_range = (1, 3)
max_features = 10000

label_enc = LabelEncoder()
enc = OneHotEncoder()
countVec = CountVectorizer(max_features=max_features, ngram_range=ngram_range, min_df=min_df)
# Learn the vocabulary from the training reports and turn them into a
# bag-of-words count matrix in a single pass
trainX = countVec.fit_transform(text_train_X)

In [8]:
# Rebuild the dense bag-of-words matrix from the fitted vectoriser rather than from `trainX`:
# this cell rebinds `trainX` to an ndarray, so reading `trainX.toarray()` here would make
# a second run of the cell fail with AttributeError.
bow_features = countVec.transform(text_train_X).toarray()
print(bow_features.shape)
# Append the tabular features (encoded categoricals, baseline flag, date_dist) as extra columns
trainX = np.hstack([bow_features, text_train.values])
print(trainX)

(2250, 10000)
[[  0   0   0 ...   0   0 -15]
 [  0   0   0 ...   0   0  56]
 [  0   0   0 ...   1   0 -14]
 ...
 [  0   0   0 ...   0   0  84]
 [  0   0   0 ...   0   0  -6]
 [  0   0   0 ...   0   0  55]]


In [9]:
# Fit the encoder on the class labels and map every label to its integer code.
# LabelEncoder expects a 1-D array; the previous reshape(-1, 1) passed a column vector
# and triggered a DataConversionWarning on every run.
label_enc_y = label_enc.fit(text_train_y.values)
trainY = label_enc_y.transform(text_train_y.values)

  y = column_or_1d(y, warn=True)


In [10]:
# Baseline: strongly regularised (C=0.01) logistic regression, scored with 5-fold cross-validation.
# saga is a stochastic solver, so random_state is fixed to make the scores repeatable across runs.
lr = LogisticRegression(C=0.01, solver='saga', random_state=0)
scores = cross_val_score(lr, trainX, trainY, cv=5)
print(scores)
print(scores.mean())



[0.50663717 0.46119734 0.48444444 0.48997773 0.45758929]
0.4799691931663041


In [11]:
# Variant: multinomial logistic regression with class weights balanced by class frequency,
# scored with 5-fold cross-validation.
# sag is a stochastic solver, so random_state is fixed to make the scores repeatable across runs.
lr = LogisticRegression(C=0.01, class_weight='balanced', solver='sag', multi_class='multinomial',
                        random_state=0)
scores = cross_val_score(lr, trainX, trainY, cv=5)
print(scores)
print(scores.mean())



[0.39380531 0.41463415 0.39777778 0.45434298 0.43080357]
0.41827275793842505




In [12]:
# Gradient boosting on the same features, scored with 5-fold cross-validation.
# 1000 boosting stages over this many dense columns is slow (the serial run was interrupted),
# so the folds are evaluated in parallel with n_jobs=-1.
# random_state is fixed so that the result is repeatable across runs.
clf = GradientBoostingClassifier(n_estimators=1000, random_state=0)
scores = cross_val_score(clf, trainX, trainY, cv=5, n_jobs=-1)
print(scores)
print(scores.mean())

KeyboardInterrupt: 