In [1]:
import re
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Read the training and validation sets from CSV files in the working directory.
df_train = pd.read_csv('urop_dataset_training.csv')
df_validation = pd.read_csv('urop_dataset_validation.csv')

In [3]:
def make_POD(curr):
    """Fold the "POD/brain" label into plain "POD".

    Any other value is returned unchanged.
    """
    return "POD" if curr == "POD/brain" else curr

In [4]:
# Target column for the classifier: trim surrounding whitespace from every label,
# then merge "POD/brain" into "POD" via make_POD.
df_train["Objective Response per RECIST v1.1"] = (
    df_train["Objective Response per RECIST v1.1"]
    .apply(lambda label: make_POD(label.strip()))
)

In [5]:
# Normalise each scan report: collapse every run of non-word characters into a
# single space, lower-case, trim, and append one trailing space so that reports
# stay separated when they are concatenated per patient later on.
# The pattern is a raw string: '\W' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
df_train["clean_report_text"] = df_train["Scan report text"].apply(
    lambda text: re.sub(r'\W+', ' ', text).lower().strip() + ' '
)

In [6]:
# One document per patient: concatenate all of that patient's cleaned reports
# (summing strings joins them).
temp_df = (
    df_train
    .groupby(['Patient ID'])['clean_report_text']
    .apply(lambda reports: reports.sum())
)
# One label per patient: the first value found in that patient's group.
temp_labels = (
    df_train
    .groupby(['Patient ID'])["Objective Response per RECIST v1.1"]
    .first()
)

In [7]:
# A notebook cell only displays its last expression, so a bare
# `temp_labels.shape` on its own line was silently discarded and the two shapes
# were never actually compared. Assert the invariant, then display the shape.
assert temp_labels.shape == temp_df.shape, "expected exactly one label per patient document"
temp_df.shape

(362,)

In [8]:
# Bag-of-n-grams features built from the per-patient concatenated report text.
text_train_X = temp_df
text_train_y = temp_labels
min_df = 1
# max_df used to be read from leftover kernel state: no earlier cell assigns it,
# so this cell raised NameError after Restart & Run All. 0.8 is the only value
# the notebook assigns anywhere.
# NOTE(review): confirm 0.8 is the value that produced the recorded scores.
max_df = 0.8
ngram_range = (1, 3)
max_features = 600000
label_enc = LabelEncoder()
enc = OneHotEncoder()
# Tokens excluded from the vocabulary in addition to the max_df filter.
stopwords = ['mm', 'dd', '2017', '2016', '2015', '2014', '2013', '2012', 'date', 'md']
countVec = CountVectorizer(min_df = min_df, max_df = max_df, ngram_range = ngram_range, max_features = max_features, stop_words = stopwords)
# Learn vocabulary from train set
countVec.fit(text_train_X)
# Transform the per-patient documents into a matrix of n-gram counts
trainX = countVec.transform(text_train_X)

In [9]:
# Encode the string response labels as integer class ids.
# LabelEncoder works on 1-D label arrays; the earlier reshape(-1, 1) handed it a
# column vector, which sklearn flattened again while emitting a
# DataConversionWarning. Passing the 1-D values directly gives the same encoding
# without the warning.
label_enc_y = label_enc.fit(text_train_y.values)
trainY = label_enc_y.transform(text_train_y.values)

  y = column_or_1d(y, warn=True)


In [10]:
# 5-fold cross-validation of a logistic regression with strong regularisation
# (small C) and class weights balanced against label frequency.
# Prints the per-fold scores followed by their mean.
lr = LogisticRegression(C=.005, verbose=True, class_weight='balanced')
scores = cross_val_score(lr, trainX, trainY, cv=5)
print(scores)
print(scores.mean())

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][0.46575342 0.57534247 0.68493151 0.58333333 0.52112676]
0.5660974982313975


In [11]:
# Show the label strings in encoder order; position i is the label encoded as integer i.
label_enc_y.classes_

array(['CR', 'POD', 'PR', 'SD'], dtype=object)

In [12]:
# Out-of-fold predictions for every sample from 5-fold cross-validation.
y_pred = cross_val_predict(lr, trainX, trainY, cv=5)
# NOTE: conf_mat is stored here but not displayed by this cell.
conf_mat = confusion_matrix(trainY ,y_pred)
# Macro averaging gives every class equal weight regardless of how often it occurs.
f1_score(trainY, y_pred, average='macro')

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

  'precision', 'predicted', average, warn_for)


0.4144565960828619

In [None]:
# Experiment: C=0.01 without class weighting, scored with 5-fold cross-validation.
lr = LogisticRegression(C=0.01, verbose=True)
scores = cross_val_score(lr, trainX, trainY, cv=5)
print(scores)
print(scores.mean())

In [None]:
# Experiment: C=0.005 with balanced class weights, scored with 5-fold cross-validation.
lr = LogisticRegression(C=0.005, verbose=True, class_weight='balanced')
scores = cross_val_score(lr, trainX, trainY, cv=5)
print(scores)
print(scores.mean())

In [None]:
# Experiment: C=0.005 without class weighting, scored with 5-fold cross-validation.
lr = LogisticRegression(C=0.005, verbose=True)
scores = cross_val_score(lr, trainX, trainY, cv=5)
print(scores)
print(scores.mean())

In [None]:
# Feature variant: up to 5-grams, terms present in more than 80% of documents
# dropped, vocabulary capped at 100000 entries, no custom stop-word list.
text_train_X, text_train_y = temp_df, temp_labels
min_df, max_df = 1, 0.8
ngram_range = (1, 5)
max_features = 100000
label_enc, enc = LabelEncoder(), OneHotEncoder()
countVec = CountVectorizer(
    min_df=min_df,
    max_df=max_df,
    ngram_range=ngram_range,
    max_features=max_features,
)
# Learn the vocabulary and build the n-gram count matrix in a single call
trainX = countVec.fit_transform(text_train_X)

In [None]:
# Re-score the balanced C=0.005 model (5-fold CV) on whatever trainX currently holds.
lr = LogisticRegression(C=0.005, verbose=True, class_weight='balanced')
scores = cross_val_score(lr, trainX, trainY, cv=5)
print(scores)
print(scores.mean())

In [None]:
# Feature variant: up to 5-grams, vocabulary capped at 600000 entries, no max_df
# filter and no custom stop-word list.
text_train_X, text_train_y = temp_df, temp_labels
min_df = 1
ngram_range = (1, 5)
max_features = 600000
label_enc, enc = LabelEncoder(), OneHotEncoder()
countVec = CountVectorizer(
    min_df=min_df,
    ngram_range=ngram_range,
    max_features=max_features,
)
# Learn the vocabulary and build the n-gram count matrix in a single call
trainX = countVec.fit_transform(text_train_X)

In [None]:
# Re-score the balanced C=0.005 model (5-fold CV) on whatever trainX currently holds.
lr = LogisticRegression(C=0.005, verbose=True, class_weight='balanced')
scores = cross_val_score(lr, trainX, trainY, cv=5)
print(scores)
print(scores.mean())

In [None]:
# Experiment: ten times stronger regularisation (C=0.0005) with balanced class
# weights, scored with 5-fold cross-validation.
lr = LogisticRegression(C=0.0005, verbose=True, class_weight='balanced')
scores = cross_val_score(lr, trainX, trainY, cv=5)
print(scores)
print(scores.mean())

In [13]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle so the
# split is repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions in the
# two parts are left to chance.
X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size=0.2, random_state=42)

In [14]:
# Fit on the training split only (no class weighting here, unlike the balanced
# cross-validation cells) and print the score on the training split, then on the
# held-out split.
lr = LogisticRegression(C=0.0005)
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.9896193771626297
0.5342465753424658


In [15]:
print("Intepreting LR")
# The vocabulary is identical for every class, so build the array once rather
# than once per label. get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); use whichever the installed version offers.
if hasattr(countVec, "get_feature_names_out"):
    vocab = np.asarray(countVec.get_feature_names_out())
else:
    vocab = np.array(countVec.get_feature_names())
num_features = 10

# Walk the rows that coef_ actually has instead of assuming exactly 4 classes.
for label, coefs in enumerate(lr.coef_):
    # Never request more features than exist (argpartition would raise).
    k = min(num_features, coefs.shape[0])
    # Indices of the k largest coefficients, in no particular order
    top = np.argpartition(coefs, -k)[-k:]
    # Sort top
    top = top[np.argsort(coefs[top])]
    s_coef = coefs[top]
    scored_vocab = list(zip(vocab[top], s_coef))
    print("Top weighted features for label {}:\n \n {}\n -- \n".format(label, scored_vocab))

Intepreting LR
Top weighted features for label 0:
 
 [('11 189', 0.004271394157995591), ('diverticulosis', 0.004328962740914827), ('adenopathy hepatobiliary', 0.004433105676350337), ('metabolic', 0.0047113487954918995), ('nonhypermetabolic', 0.005099495975650554), ('fuqua', 0.005327528047490722), ('james fuqua', 0.005327528047490722), ('no new', 0.006003575957341326), ('resolved', 0.006309755093716532), ('treated', 0.007577274781273187)]
 -- 

Top weighted features for label 1:
 
 [('increased size', 0.009449467374234011), ('hepatic', 0.009503639933352384), ('increased in size', 0.009615268400260302), ('increased in', 0.010310566820362499), ('size of', 0.010395717500313546), ('suv', 0.010443138019139575), ('right pleural', 0.010755590303207564), ('pleural effusion', 0.011923299681765118), ('increase in', 0.014036317942014495), ('lytic', 0.016487857827992124)]
 -- 

Top weighted features for label 2:
 
 [('right lower lobe', 0.009673540835753537), ('resolution', 0.009751904399475994), (

In [16]:
# Predict encoded class ids for the held-out split (overwrites any earlier y_pred).
y_pred = lr.predict(X_test)

In [17]:
# Rows are true classes, columns are predicted classes, both in encoded-label order.
confusion_matrix(y_test, y_pred)

array([[ 0,  0,  1,  0],
       [ 0, 12,  1, 14],
       [ 0,  1,  7,  6],
       [ 0,  8,  3, 20]], dtype=int64)

In [None]:
def clean_data(report):
    """Return the part of ``report`` between 'impression' and the dictation footer.

    The slice starts at the first occurrence of 'impression' (or at the start of
    the text when that word is absent) and ends just before the first
    'dictated by' found after that point. When 'dictated by' is missing,
    'dictatedby' is tried instead; when neither is present the slice runs to the
    end of the text.

    The end marker is searched for only from the start position onward.
    Searching from index 0 meant that a 'dictated by' located before
    'impression' produced an end index smaller than the start index, and
    therefore an empty string.

    Matching is case-sensitive with lower-case markers, so the caller is expected
    to pass lower-cased text.
    """
    start_idx = max(0, report.find('impression'))

    # Preferred marker first, squashed-together fallback second.
    for marker in ('dictated by', 'dictatedby'):
        stop_idx = report.find(marker, start_idx)
        if stop_idx != -1:
            return report[start_idx:stop_idx]

    return report[start_idx:]

In [None]:
# Re-read both CSV files, rebinding df_train and df_validation to freshly loaded
# frames for the pipeline that follows.
df_train = pd.read_csv('urop_dataset_training.csv')
df_validation = pd.read_csv('urop_dataset_validation.csv')

In [None]:
def make_POD(curr):
    """Map the "POD/brain" label to "POD"; pass every other value through untouched."""
    if curr != "POD/brain":
        return curr
    return "POD"

In [None]:
# Target column for the classifier: trim surrounding whitespace from every label,
# then merge "POD/brain" into "POD" via make_POD.
df_train["Objective Response per RECIST v1.1"] = (
    df_train["Objective Response per RECIST v1.1"]
    .apply(lambda label: make_POD(label.strip()))
)

In [None]:
# Normalise each scan report (collapse runs of non-word characters into single
# spaces, lower-case, trim, append one trailing space), then keep only the part
# selected by clean_data.
# The pattern is a raw string: '\W' in a normal string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
df_train["clean_report_text"] = df_train["Scan report text"].apply(
    lambda text: clean_data(re.sub(r'\W+', ' ', text).lower().strip() + ' ')
)

In [None]:
# One document per patient: concatenate all of that patient's cleaned reports
# (summing strings joins them).
temp_df = (
    df_train
    .groupby(['Patient ID'])['clean_report_text']
    .apply(lambda reports: reports.sum())
)
# One label per patient: the first value found in that patient's group.
temp_labels = (
    df_train
    .groupby(['Patient ID'])["Objective Response per RECIST v1.1"]
    .first()
)

In [None]:
# Feature variant for the trimmed reports: up to 10-grams, vocabulary capped at
# 1000000 entries, no max_df filter and no custom stop-word list.
text_train_X, text_train_y = temp_df, temp_labels
min_df = 1
ngram_range = (1, 10)
max_features = 1000000
label_enc, enc = LabelEncoder(), OneHotEncoder()
countVec = CountVectorizer(
    min_df=min_df,
    ngram_range=ngram_range,
    max_features=max_features,
)
# Learn the vocabulary and build the n-gram count matrix in a single call
trainX = countVec.fit_transform(text_train_X)

In [None]:
# Encode the string response labels as integer class ids.
# LabelEncoder works on 1-D label arrays; reshape(-1, 1) handed it a column
# vector, which sklearn flattens again while emitting a DataConversionWarning.
# Passing the 1-D values directly gives the same encoding without the warning.
label_enc_y = label_enc.fit(text_train_y.values)
trainY = label_enc_y.transform(text_train_y.values)

In [None]:
# 5-fold cross-validation of the balanced, strongly regularised (C=0.0005) model
# on the current trainX / trainY; prints per-fold scores and their mean.
lr = LogisticRegression(C=0.0005, verbose=True, class_weight='balanced')
scores = cross_val_score(lr, trainX, trainY, cv=5)
print(scores)
print(scores.mean())

In [None]:
# Hold out 20% of the samples for testing; random_state fixes the shuffle so the
# split is repeatable.
# NOTE(review): no `stratify` argument is passed, so class proportions in the
# two parts are left to chance.
X_train, X_test, y_train, y_test = train_test_split(trainX, trainY, test_size=0.2, random_state=42)

In [None]:
# Fit on the training split and print the score on the training split, then on
# the held-out split.
# NOTE(review): C=0.0007 and no class weighting here, whereas the preceding
# cross-validation cell used C=0.0005 with class_weight='balanced' — confirm the
# difference is intentional.
lr = LogisticRegression(C=0.0007)
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

In [None]:
print("Intepreting LR")
# The vocabulary is identical for every class, so build the array once rather
# than once per label. get_feature_names() was removed in scikit-learn 1.2 in
# favour of get_feature_names_out(); use whichever the installed version offers.
if hasattr(countVec, "get_feature_names_out"):
    vocab = np.asarray(countVec.get_feature_names_out())
else:
    vocab = np.array(countVec.get_feature_names())
num_features = 10

# Walk the rows that coef_ actually has instead of assuming exactly 4 classes.
for label, coefs in enumerate(lr.coef_):
    # Never request more features than exist (argpartition would raise).
    k = min(num_features, coefs.shape[0])
    # Indices of the k largest coefficients, in no particular order
    top = np.argpartition(coefs, -k)[-k:]
    # Sort top
    top = top[np.argsort(coefs[top])]
    s_coef = coefs[top]
    scored_vocab = list(zip(vocab[top], s_coef))
    print("Top weighted features for label {}:\n \n {}\n -- \n".format(label, scored_vocab))