In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import make_scorer, f1_score, roc_curve, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2 


def preprocess_data(data):
    data_original = data.copy()
    data = data.replace('?', np.NaN)

    # Drop unnecessary columns
    # weights, payer_code, diag_1_desc, diag_2_desc, diag_3_desc
    data.drop(labels=['weight', 'payer_code', 'diag_1_desc', 'diag_2_desc', 'diag_3_desc'], axis=1, inplace=True)

    data['diag_1'] = group_diagnoses(data['diag_1'])
    data['diag_2'] = group_diagnoses(data['diag_2'])
    data['diag_3'] = group_diagnoses(data['diag_3'])

    # Encode string data to numericals
    to_cat = list(data.select_dtypes(['object']).columns)
    data[to_cat] = data[to_cat].astype('category')
    cat_columns = data.select_dtypes(['category']).columns
    data[cat_columns] = data[cat_columns].apply(lambda x: x.cat.codes)

    # Get Readmitted as labels
    labels = data['readmitted']
    data.drop(labels=['readmitted'], axis=1, inplace=True)
    # data = data.replace(-1, np.NaN)


    return labels.values.ravel(), data.values


def preprocess_text(text):
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(text) 
    text = [w for w in word_tokens if not w in stop_words]
    text = ' '.join(text)
    return WordNetLemmatizer().lemmatize(text)


def extract_textual_features(data, colname):
    corpus = data[colname]
    corpus = corpus.replace(np.NaN, '').values
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=0.0001)
    X = vectorizer.fit_transform(corpus)
    text_features = pd.DataFrame.sparse.from_spmatrix(X)
    return text_features


def group_diagnoses(df):
    # Create mapping from
    l_old = []
    l_new = []

    idx = 0
    tmp_list1 = list(range(390, 460))
    tmp_list1 += [785]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1

    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = list(range(460, 520))
    tmp_list1 += [786]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1

    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = list(range(520, 579))
    tmp_list1 += [787]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1

    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = [str(i) for i in list(np.arange(250, 251, 0.01))]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = range(800, 1000)
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = range(710, 740)
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = list(range(580, 630))
    tmp_list1 += [788]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = range(140, 240)
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    l_old = [str(i) for i in l_old]
    d = dict(zip(l_old, l_new))

    df_new = df.copy()

    df_new = df_new.map(d)
    df_new = df_new.replace(df_new[pd.isna(df_new)], 8)
    df_new = df_new.astype(int)
    return df_new



[nltk_data] Downloading package wordnet to /home/ander/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
data_train = pd.read_csv('task1/data/diab_train.csv', index_col=0)
data_test = pd.read_csv('task1/data/diab_test.csv', index_col=0)
data_validation = pd.read_csv('task1/data/diab_validation.csv', index_col=0)

data_train = pd.concat([data_train, data_validation, data_test], axis=0)
data_train.reset_index(drop=True, inplace=True)
data = data_train.copy()
y, X = preprocess_data(data_train)
print(y.shape, X.shape)
data

(10000,) (10000, 45)


Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,diag_1_desc,diag_2_desc,diag_3_desc
0,AfricanAmerican,Male,[60-70),?,Emergency,Discharged to home,Emergency Room,4,MC,?,...,No,No,No,No,No,No,0,Acute pericarditis in diseases classified else...,Secondary malignant neoplasm of kidney,"Congestive heart failure, unspecified"
1,Caucasian,Female,[70-80),?,Elective,,Physician Referral,1,?,Family/GeneralPractice,...,No,No,No,No,Ch,Yes,1,Malignant essential hypertension,"Spinal stenosis, unspecified region",Diabetes mellitus without mention of complicat...
2,Caucasian,Female,[80-90),?,Urgent,Discharged/transferred to SNF,Emergency Room,2,MC,Emergency/Trauma,...,No,No,No,No,Ch,Yes,0,"Urinary tract infection, site not specified",Streptococcus infection in conditions classifi...,"Congestive heart failure, unspecified"
3,AfricanAmerican,Female,[50-60),?,Emergency,Discharged to home,Emergency Room,4,DM,?,...,No,No,No,No,Ch,Yes,1,"Respiratory abnormality, unspecified","Hypertensive chronic kidney disease, malignant...",Diabetes mellitus without mention of complicat...
4,Caucasian,Male,[80-90),?,Elective,,Physician Referral,13,?,?,...,No,No,No,No,Ch,Yes,1,Coronary atherosclerosis of unspecified type o...,"Chronic airway obstruction, not elsewhere clas...",Malignant essential hypertension
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,?,Male,[40-50),?,Urgent,Discharged to home,Transfer from a hospital,1,UN,Cardiology,...,No,No,No,No,No,Yes,0,Coronary atherosclerosis of unspecified type o...,Diabetes mellitus without mention of complicat...,Pure hypercholesterolemia
9996,Caucasian,Male,[80-90),?,Not Available,Discharged/transferred to SNF,,6,?,?,...,No,No,No,No,No,Yes,0,Atherosclerosis of aorta,Endomyocardial fibrosis,Diabetes mellitus without mention of complicat...
9997,AfricanAmerican,Male,[40-50),?,Elective,,Physician Referral,4,?,Urology,...,No,No,No,No,No,Yes,0,Malignant neoplasm of prostate,Hypertrophy (benign) of prostate without urina...,"Obesity, unspecified"
9998,AfricanAmerican,Male,[50-60),?,Emergency,Discharged to home,Emergency Room,2,?,InternalMedicine,...,No,No,No,No,No,Yes,0,Cellulitis and abscess of face,Diabetes mellitus without mention of complicat...,


In [4]:
#### Create a function for xgboost training.
#### We will call this function later for different types of textual feature extraction

def train_xgboost(X, y):
    X_train = X[:8000]
    y_train = y[:8000]
    y_test, X_test = y[8000:], X[8000:]
    print('X_train shape: {}  and X_test shape: {}'.format(X_train.shape, X_test.shape))

    scoring ={'auroc':    make_scorer(roc_auc_score, greater_is_better=True),
            'f1_score': make_scorer(f1_score, average='micro', greater_is_better=True)}

    xgb_model = xgb.XGBClassifier(scale_pos_weight=2, disable_defeult_eval_metric=0)

    parameters = {
        'objective': ['binary:logistic'],
        'max_depth': [200],
        'min_child_weight': [9],
        'n_estimators': [500],
        'seed': [11],
        'learning_rate': [0.01],
        'max_delta_step': [0],
        'subsample': [0.75]
    }

    clf = GridSearchCV(estimator=xgb_model, param_grid=parameters, n_jobs=3, cv=4, scoring=scoring, verbose=3,
                       refit='f1_score')
    clf.fit(X_train, y_train)

    print('best score: ', clf.best_score_)
    print('best parameters: ', clf.best_params_)

    y_predict = clf.predict(X_test)
    print("0 predictions: {}  1 predictions: {}".format(np.count_nonzero(y_predict == 0) ,np.count_nonzero(y_predict == 1)))

    accuracy = accuracy_score(y_test, y_predict)
    aucroc_score = roc_auc_score(y_test, y_predict)
    f1Score = f1_score(y_test, y_predict)
    print('Accuracy: {} AUCROC: {} F1: {}'.format(accuracy, aucroc_score, f1Score))



In [5]:
##### There are 2 options: Either you can train all the model with features extracted only from primary diagnoses
##### or you can train all the models with features extracted from all 3 description of diagnoses
##### Change the OPTION1 variable appropriately. If you want features from all, set it to False

OPTION1 = False

if OPTION1:
    textdata = data['diag_1_desc']
    textdata = diag1.replace(np.NaN, '').values
else:
    textdata = pd.concat([data['diag_1_desc'], data['diag_2_desc'], data['diag_3_desc']], axis=0)
    textdata.reset_index(inplace=True, drop=True)
    textdata = textdata.replace(np.NaN, ' ').values




# Baseline Model (without NLP)

In [6]:
train_xgboost(X, y)

X_train shape: (8000, 45)  and X_test shape: (2000, 45)
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   4 out of   4 | elapsed:   18.0s finished


KeyboardInterrupt: 

# TF

In [6]:
vectorizer = CountVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text)
text_features = vectorizer.fit_transform(textdata)
text_features = pd.DataFrame.sparse.from_spmatrix(text_features).values

print(text_features)

text_features = text_features.reshape(10000,-1)
print(text_features.shape)

if OPTION1==False:
    dim = int(text_features.shape[1]/3)
    print(dim)
    diag1 = text_features[:,:dim]
    diag2 = text_features[:,dim:dim*2]
    diag3 = text_features[:, 2*dim:]
    print(diag1.shape, diag2.shape, diag3.shape)

    text_features = diag1 + diag2 + diag3 
    text_features.shape


selector = SelectKBest(score_func=chi2, k=20)
text_features = selector.fit_transform(text_features, y)
text_features.shape
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

  'stop_words.' % sorted(inconsistent))


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(10000, 3552)
1184
(10000, 1184) (10000, 1184) (10000, 1184)
(10000, 65)
X_train shape: (8000, 65)  and X_test shape: (2000, 65)
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   4 out of   4 | elapsed:  2.7min finished


best score:  0.62375
best parameters:  {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 200, 'min_child_weight': 9, 'n_estimators': 500, 'objective': 'binary:logistic', 'seed': 11, 'subsample': 0.75}
0 predictions: 988  1 predictions: 1012
Accuracy: 0.6375 AUCROC: 0.6449530951751605 F1: 0.5983379501385041


# TFIDF

In [7]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text)
text_features = vectorizer.fit_transform(textdata)
text_features = pd.DataFrame.sparse.from_spmatrix(text_features).values

print(text_features)

text_features = text_features.reshape(10000,-1)
print(text_features.shape)

if OPTION1:
    dim = int(text_features.shape[1]/3)
    print(dim)
    diag1 = text_features[:,:dim]
    diag2 = text_features[:,dim:dim*2]
    diag3 = text_features[:, 2*dim:]
    print(diag1.shape, diag2.shape, diag3.shape)

    text_features = diag1 + diag2 + diag3 
    text_features.shape


selector = SelectKBest(score_func=chi2, k=20)
text_features = selector.fit_transform(text_features, y)
text_features.shape
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

  'stop_words.' % sorted(inconsistent))


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(10000, 3552)
(10000, 65)
X_train shape: (8000, 65)  and X_test shape: (2000, 65)
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   4 out of   4 | elapsed:  2.6min finished


best score:  0.624
best parameters:  {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 200, 'min_child_weight': 9, 'n_estimators': 500, 'objective': 'binary:logistic', 'seed': 11, 'subsample': 0.75}
0 predictions: 973  1 predictions: 1027
Accuracy: 0.643 AUCROC: 0.6523213160723857 F1: 0.6076923076923076


# Bigram as TF

In [8]:
vectorizer = CountVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text, ngram_range=(2,2))
text_features = vectorizer.fit_transform(textdata)
text_features = pd.DataFrame.sparse.from_spmatrix(text_features).values
# print(vectorizer.get_feature_names())
# print(text_features)

text_features = text_features.reshape(10000,-1)
# print(text_features.shape)

if OPTION1:
    dim = int(text_features.shape[1]/3)
    diag1 = text_features[:,:dim]
    diag2 = text_features[:,dim:dim*2]
    diag3 = text_features[:, 2*dim:]
    # print(diag1.shape, diag2.shape, diag3.shape)

    text_features = diag1 + diag2 + diag3 
    text_features.shape


selector = SelectKBest(score_func=chi2, k=20)
text_features = selector.fit_transform(text_features, y)
text_features.shape
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

  'stop_words.' % sorted(inconsistent))


(10000, 65)
X_train shape: (8000, 65)  and X_test shape: (2000, 65)
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   4 out of   4 | elapsed:  1.8min finished


best score:  0.625125
best parameters:  {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 200, 'min_child_weight': 9, 'n_estimators': 500, 'objective': 'binary:logistic', 'seed': 11, 'subsample': 0.75}
0 predictions: 966  1 predictions: 1034
Accuracy: 0.6275 AUCROC: 0.6368843578494929 F1: 0.5922276956759716


# Trigram as TF

In [9]:
vectorizer = CountVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text, ngram_range=(3,3))
text_features = vectorizer.fit_transform(textdata)
text_features = pd.DataFrame.sparse.from_spmatrix(text_features).values
# print(vectorizer.get_feature_names())
# print(text_features)

text_features = text_features.reshape(10000,-1)
# print(text_features.shape)

if OPTION1:
    dim = int(text_features.shape[1]/3)
    diag1 = text_features[:,:dim]
    diag2 = text_features[:,dim:dim*2]
    diag3 = text_features[:, 2*dim:]
    #   print(diag1.shape, diag2.shape, diag3.shape)

    text_features = diag1 + diag2 + diag3 
    text_features.shape


selector = SelectKBest(score_func=chi2, k=20)
text_features = selector.fit_transform(text_features, y)
text_features.shape
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

  'stop_words.' % sorted(inconsistent))


(10000, 65)
X_train shape: (8000, 65)  and X_test shape: (2000, 65)
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   4 out of   4 | elapsed:  1.6min finished


best score:  0.624125
best parameters:  {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 200, 'min_child_weight': 9, 'n_estimators': 500, 'objective': 'binary:logistic', 'seed': 11, 'subsample': 0.75}
0 predictions: 959  1 predictions: 1041
Accuracy: 0.638 AUCROC: 0.6486113476348037 F1: 0.6052344601962922


# Bigram as TFIDF

In [10]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text, ngram_range=(2,2))
text_features = vectorizer.fit_transform(textdata)
text_features = pd.DataFrame.sparse.from_spmatrix(text_features).values
# print(vectorizer.get_feature_names())
# print(text_features)

text_features = text_features.reshape(10000,-1)
print(text_features.shape)

if OPTION1:
    dim = int(text_features.shape[1]/3)
    diag1 = text_features[:,:dim]
    diag2 = text_features[:,dim:dim*2]
    diag3 = text_features[:, 2*dim:]
    # print(diag1.shape, diag2.shape, diag3.shape)

    text_features = diag1 + diag2 + diag3 
    text_features.shape


selector = SelectKBest(score_func=chi2, k=20)
text_features = selector.fit_transform(text_features, y)
text_features.shape
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

  'stop_words.' % sorted(inconsistent))


(10000, 4383)
(10000, 65)
X_train shape: (8000, 65)  and X_test shape: (2000, 65)
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   4 out of   4 | elapsed:  1.3min finished


best score:  0.6251249999999999
best parameters:  {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 200, 'min_child_weight': 9, 'n_estimators': 500, 'objective': 'binary:logistic', 'seed': 11, 'subsample': 0.75}
0 predictions: 967  1 predictions: 1033
Accuracy: 0.638 AUCROC: 0.6477462803674655 F1: 0.6035049288061337


# Trigram as TDIDF

In [11]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text, ngram_range=(3,3))
text_features = vectorizer.fit_transform(textdata)
text_features = pd.DataFrame.sparse.from_spmatrix(text_features).values
# print(vectorizer.get_feature_names())
# print(text_features)

text_features = text_features.reshape(10000,-1)
# print(text_features.shape)

if OPTION1:
    dim = int(text_features.shape[1]/3)
    diag1 = text_features[:,:dim]
    diag2 = text_features[:,dim:dim*2]
    diag3 = text_features[:, 2*dim:]
#     print(diag1.shape, diag2.shape, diag3.shape)

    text_features = diag1 + diag2 + diag3 
    text_features.shape


selector = SelectKBest(score_func=chi2, k=20)
text_features = selector.fit_transform(text_features, y)
text_features.shape
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

  'stop_words.' % sorted(inconsistent))


(10000, 65)
X_train shape: (8000, 65)  and X_test shape: (2000, 65)
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   4 out of   4 | elapsed:  1.2min finished


best score:  0.622125
best parameters:  {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 200, 'min_child_weight': 9, 'n_estimators': 500, 'objective': 'binary:logistic', 'seed': 11, 'subsample': 0.75}
0 predictions: 977  1 predictions: 1023
Accuracy: 0.637 AUCROC: 0.6456201790522081 F1: 0.6002202643171806


# Word2Vec

In [14]:
from gensim.models import Word2Vec
from nltk import word_tokenize

sentences = [word_tokenize(sentence) for sentence in textdata]
model= Word2Vec(sentences, min_count=1, size=50)



In [15]:
def compute_features(sentence):
    total = 0
    for word in sentence:
        total += model.wv[word]
        
    if len(sentence) == 0:
        return 0
    else:
        return total/len(sentence)

text_features = [compute_features(sentence) for sentence in sentences]
text_features = np.array(text_features)
text_features.shape
text_features = text_features.reshape(10000,-1)
text_features = np.sum(text_features, axis=1)
text_features = np.stack(text_features)
text_features.shape

scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

Xnew = np.concatenate((X, text_features), axis=1)

train_xgboost(Xnew, y)


X_train shape: (8000, 95)  and X_test shape: (2000, 95)
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   4 out of   4 | elapsed:  2.0min finished


best score:  0.620125
best parameters:  {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 200, 'min_child_weight': 9, 'n_estimators': 500, 'objective': 'binary:logistic', 'seed': 11, 'subsample': 0.75}
0 predictions: 1107  1 predictions: 893
Accuracy: 0.637 AUCROC: 0.6315628359579626 F1: 0.5693950177935944
