In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import make_scorer, f1_score, roc_curve, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, chi2 


def preprocess_data(data):
    """Turn the raw frame into numeric label and feature arrays.

    Steps: mark '?' entries as missing, drop the unused columns (including the
    free-text diagnosis descriptions), bucket the three diagnosis-code columns
    with ``group_diagnoses`` and integer-encode every remaining string column.

    The caller's frame is not modified; all work happens on copies.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns dropped below plus ``diag_1``, ``diag_2``,
        ``diag_3`` and ``readmitted``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(labels, features)`` -- note that the labels come first.
    """
    # np.nan instead of the np.NaN alias, which was removed in NumPy 2.0.
    data = data.replace('?', np.nan)

    # Drop unnecessary columns
    data = data.drop(columns=['weight', 'payer_code', 'diag_1_desc', 'diag_2_desc', 'diag_3_desc'])

    for col in ('diag_1', 'diag_2', 'diag_3'):
        data[col] = group_diagnoses(data[col])

    # Encode string data as integer category codes.
    to_cat = list(data.select_dtypes(['object']).columns)
    data[to_cat] = data[to_cat].astype('category')
    cat_columns = data.select_dtypes(['category']).columns
    data[cat_columns] = data[cat_columns].apply(lambda x: x.cat.codes)

    # Split the target column off from the features.
    labels = data['readmitted']
    data = data.drop(columns=['readmitted'])

    return labels.values.ravel(), data.values


def preprocess_text(text):
    """Drop English stop words from ``text`` and lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        A single document.

    Returns
    -------
    str
        The kept, lemmatized tokens joined by single spaces.

    Notes
    -----
    Relies on NLTK's stop-word list, tokenizer models and WordNet data being
    available locally.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Lemmatize token by token: ``lemmatize`` looks up one word at a time, so
    # handing it the whole joined sentence would return the sentence unchanged.
    kept = [lemmatizer.lemmatize(w) for w in word_tokenize(text) if w not in stop_words]
    return ' '.join(kept)


def extract_textual_features(data, colname):
    """Compute TF-IDF features for one text column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column.
    colname : str
        Name of the column to vectorize; missing entries are treated as
        empty documents.

    Returns
    -------
    pandas.DataFrame
        Sparse-backed frame with one row per input row and one column per
        vocabulary term.
    """
    # np.nan instead of the np.NaN alias, which was removed in NumPy 2.0.
    corpus = data[colname].replace(np.nan, '').values
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=0.0001)
    tfidf = vectorizer.fit_transform(corpus)
    return pd.DataFrame.sparse.from_spmatrix(tfidf)


def group_diagnoses(df):
    """Map diagnosis-code strings to coarse integer groups 0-8.

    Parameters
    ----------
    df : pandas.Series
        Diagnosis codes as strings (e.g. ``'428'``, ``'250.01'``); missing
        values are allowed.

    Returns
    -------
    pandas.Series
        Integer group per input row, same index as ``df``. Codes that match
        no group, and missing values, are assigned group 8.
    """
    # group id -> (integer code range, end exclusive; extra single codes).
    # NOTE(review): the ranges look like ICD-9 chapters (circulatory,
    # respiratory, digestive, injury, musculoskeletal, genitourinary,
    # neoplasms) -- confirm against the dataset documentation.
    integer_groups = {
        0: (range(390, 460), [785]),
        1: (range(460, 520), [786]),
        2: (range(520, 579), [787]),
        4: (range(800, 1000), []),
        5: (range(710, 740), []),
        6: (range(580, 630), [788]),
        7: (range(140, 240), []),
    }
    diabetes_group = 3
    other_group = 8

    code_to_group = {}
    for group_id, (code_range, extra_codes) in integer_groups.items():
        for code in [*code_range, *extra_codes]:
            code_to_group[str(code)] = group_id

    # Build the 250 / 250.x / 250.xx keys as text. Going through floats
    # (np.arange(250, 251, 0.01)) yields '250.0' instead of '250' and strings
    # polluted by rounding noise, so real codes silently fell into "other".
    diabetes_codes = ['250']
    diabetes_codes += ['250.{}'.format(i) for i in range(10)]
    diabetes_codes += ['250.{:02d}'.format(i) for i in range(100)]
    for code in diabetes_codes:
        code_to_group[code] = diabetes_group

    # Unmapped codes come back as NaN from ``map``; send them to "other".
    grouped = df.map(code_to_group)
    return grouped.fillna(other_group).astype(int)



KeyboardInterrupt: 

In [None]:
# Read the three CSV splits; the first CSV column is used as the index.
data_validation = pd.read_csv('task1/data/diab_validation.csv', index_col=0)
data_test = pd.read_csv('task1/data/diab_test.csv', index_col=0)
data_train = pd.read_csv('task1/data/diab_train.csv', index_col=0)

# Stack train, validation and test (in that order) into one frame with a
# fresh 0..n-1 index. From here on `data_train` holds ALL rows.
data_train = pd.concat([data_train, data_validation, data_test], axis=0).reset_index(drop=True)

# Keep an untouched copy (it still has the free-text description columns).
data = data_train.copy()

# preprocess_data returns (labels, features), in that order.
y, X = preprocess_data(data_train)
print(y.shape, X.shape)
data

In [None]:
#### Create a function for xgboost training.
#### We will call this function later for different types of textual feature extraction

def train_xgboost(X, y, n_train=8000):
    """Fit an XGBoost classifier with cross-validation and print test metrics.

    The first ``n_train`` rows are used for the cross-validated grid search;
    the remaining rows form the held-out test set.

    Parameters
    ----------
    X : numpy.ndarray
        Feature matrix, one row per sample.
    y : numpy.ndarray
        Binary labels aligned with the rows of ``X``.
    n_train : int, optional
        Number of leading rows used for training (default 8000).

    Returns
    -------
    None
        Results are printed: CV best score/params, predicted class counts,
        and accuracy / AUROC / F1 on the held-out rows.
    """
    X_train, y_train = X[:n_train], y[:n_train]
    X_test, y_test = X[n_train:], y[n_train:]
    print('X_train shape: {}  and X_test shape: {}'.format(X_train.shape, X_test.shape))

    # 'roc_auc' scores the predicted probabilities. Wrapping roc_auc_score in
    # make_scorer would feed it hard 0/1 predictions and collapse the ROC
    # curve to a single operating point.
    # NOTE(review): micro-averaged F1 on a single-label problem equals
    # accuracy -- confirm that is the intended model-selection metric.
    scoring = {
        'auroc': 'roc_auc',
        'f1_score': make_scorer(f1_score, average='micro', greater_is_better=True),
    }

    # Parameter name was misspelled ("defeult"), so XGBoost never recognised it.
    xgb_model = xgb.XGBClassifier(scale_pos_weight=2, disable_default_eval_metric=0)

    # Single-point grid: GridSearchCV is used here for its CV scoring/refit.
    parameters = {
        'objective': ['binary:logistic'],
        'max_depth': [200],
        'min_child_weight': [9],
        'n_estimators': [500],
        'seed': [11],
        'learning_rate': [0.01],
        'max_delta_step': [0],
        'subsample': [0.75]
    }

    clf = GridSearchCV(estimator=xgb_model, param_grid=parameters, n_jobs=3, cv=4, scoring=scoring, verbose=3,
                       refit='f1_score')
    clf.fit(X_train, y_train)

    print('best score: ', clf.best_score_)
    print('best parameters: ', clf.best_params_)

    y_predict = clf.predict(X_test)
    # Probability of the positive class, needed for a meaningful AUROC.
    y_score = clf.predict_proba(X_test)[:, 1]
    print("0 predictions: {}  1 predictions: {}".format(np.count_nonzero(y_predict == 0) ,np.count_nonzero(y_predict == 1)))

    accuracy = accuracy_score(y_test, y_predict)
    aucroc_score = roc_auc_score(y_test, y_score)
    f1Score = f1_score(y_test, y_predict)
    print('Accuracy: {} AUCROC: {} F1: {}'.format(accuracy, aucroc_score, f1Score))



In [None]:
##### There are 2 options: either train all the models with features extracted only from the primary diagnosis,
##### or train all the models with features extracted from all 3 diagnosis descriptions.
##### Set the OPTION1 variable accordingly: True for the primary diagnosis only, False for all three descriptions.

OPTION1 = False

# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
if OPTION1:
    # Primary diagnosis only: one document per row of `data`.
    # (Previously this branch read from an undefined name `diag1`.)
    textdata = data['diag_1_desc']
    textdata = textdata.replace(np.nan, '').values
else:
    # All three descriptions, stacked block-wise: first every diag_1
    # description, then every diag_2, then every diag_3.
    textdata = pd.concat([data['diag_1_desc'], data['diag_2_desc'], data['diag_3_desc']], axis=0)
    textdata = textdata.reset_index(drop=True)
    textdata = textdata.replace(np.nan, ' ').values




# Baseline Model (without NLP)

In [None]:
# Baseline: structured features only, no text-derived columns.
train_xgboost(X=X, y=y)

# TF

In [None]:
# Unigram term counts for every description document.
vectorizer = CountVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text)
doc_features = vectorizer.fit_transform(textdata).toarray()
print(doc_features.shape)

# textdata holds one block of len(y) documents per description column
# (a single block when only the primary diagnosis is used). Viewing the matrix
# as (n_blocks, n_patients, n_terms) and summing over the blocks adds up each
# patient's OWN descriptions. A plain reshape(10000, -1) would instead glue
# three consecutive documents -- three different patients -- into one row.
n_patients = len(y)
text_features = doc_features.reshape(-1, n_patients, doc_features.shape[1]).sum(axis=0)
print(text_features.shape)

# Keep the (at most) 20 terms with the highest chi2 score, then standardize.
# NOTE(review): selector and scaler are fitted on all rows, including the rows
# train_xgboost later holds out for testing.
selector = SelectKBest(score_func=chi2, k=min(20, text_features.shape[1]))
text_features = selector.fit_transform(text_features, y)
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

# TFIDF

In [None]:
# Unigram TF-IDF weights for every description document.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text)
doc_features = vectorizer.fit_transform(textdata).toarray()
print(doc_features.shape)

# textdata holds one block of len(y) documents per description column
# (a single block when only the primary diagnosis is used). Viewing the matrix
# as (n_blocks, n_patients, n_terms) and summing over the blocks adds up each
# patient's OWN descriptions. The previous reshape(10000, -1) glued three
# consecutive documents (different patients) into one row, and the
# `if OPTION1:` guard around the summation was inverted.
n_patients = len(y)
text_features = doc_features.reshape(-1, n_patients, doc_features.shape[1]).sum(axis=0)
print(text_features.shape)

# Keep the (at most) 20 terms with the highest chi2 score, then standardize.
# NOTE(review): selector and scaler are fitted on all rows, including the rows
# train_xgboost later holds out for testing.
selector = SelectKBest(score_func=chi2, k=min(20, text_features.shape[1]))
text_features = selector.fit_transform(text_features, y)
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

# Bigram as TF

In [None]:
# Bigram counts for every description document.
vectorizer = CountVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text, ngram_range=(2,2))
doc_features = vectorizer.fit_transform(textdata).toarray()

# textdata holds one block of len(y) documents per description column
# (a single block when only the primary diagnosis is used). Viewing the matrix
# as (n_blocks, n_patients, n_terms) and summing over the blocks adds up each
# patient's OWN descriptions. The previous reshape(10000, -1) glued three
# consecutive documents (different patients) into one row, and the
# `if OPTION1:` guard around the summation was inverted.
n_patients = len(y)
text_features = doc_features.reshape(-1, n_patients, doc_features.shape[1]).sum(axis=0)

# Keep the (at most) 20 bigrams with the highest chi2 score, then standardize.
# NOTE(review): selector and scaler are fitted on all rows, including the rows
# train_xgboost later holds out for testing.
selector = SelectKBest(score_func=chi2, k=min(20, text_features.shape[1]))
text_features = selector.fit_transform(text_features, y)
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

# Trigram as TF

In [None]:
# Trigram counts for every description document.
vectorizer = CountVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text, ngram_range=(3,3))
doc_features = vectorizer.fit_transform(textdata).toarray()

# textdata holds one block of len(y) documents per description column
# (a single block when only the primary diagnosis is used). Viewing the matrix
# as (n_blocks, n_patients, n_terms) and summing over the blocks adds up each
# patient's OWN descriptions. The previous reshape(10000, -1) glued three
# consecutive documents (different patients) into one row, and the
# `if OPTION1:` guard around the summation was inverted.
n_patients = len(y)
text_features = doc_features.reshape(-1, n_patients, doc_features.shape[1]).sum(axis=0)

# Keep the (at most) 20 trigrams with the highest chi2 score, then standardize.
# NOTE(review): selector and scaler are fitted on all rows, including the rows
# train_xgboost later holds out for testing.
selector = SelectKBest(score_func=chi2, k=min(20, text_features.shape[1]))
text_features = selector.fit_transform(text_features, y)
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

# Bigram as TFIDF

In [None]:
# Bigram TF-IDF weights for every description document.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text, ngram_range=(2,2))
doc_features = vectorizer.fit_transform(textdata).toarray()
print(doc_features.shape)

# textdata holds one block of len(y) documents per description column
# (a single block when only the primary diagnosis is used). Viewing the matrix
# as (n_blocks, n_patients, n_terms) and summing over the blocks adds up each
# patient's OWN descriptions. The previous reshape(10000, -1) glued three
# consecutive documents (different patients) into one row, and the
# `if OPTION1:` guard around the summation was inverted.
n_patients = len(y)
text_features = doc_features.reshape(-1, n_patients, doc_features.shape[1]).sum(axis=0)

# Keep the (at most) 20 bigrams with the highest chi2 score, then standardize.
# NOTE(review): selector and scaler are fitted on all rows, including the rows
# train_xgboost later holds out for testing.
selector = SelectKBest(score_func=chi2, k=min(20, text_features.shape[1]))
text_features = selector.fit_transform(text_features, y)
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

# Trigram as TFIDF

In [None]:
# Trigram TF-IDF weights for every description document.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess_text, ngram_range=(3,3))
doc_features = vectorizer.fit_transform(textdata).toarray()

# textdata holds one block of len(y) documents per description column
# (a single block when only the primary diagnosis is used). Viewing the matrix
# as (n_blocks, n_patients, n_terms) and summing over the blocks adds up each
# patient's OWN descriptions. The previous reshape(10000, -1) glued three
# consecutive documents (different patients) into one row, and the
# `if OPTION1:` guard around the summation was inverted.
n_patients = len(y)
text_features = doc_features.reshape(-1, n_patients, doc_features.shape[1]).sum(axis=0)

# Keep the (at most) 20 trigrams with the highest chi2 score, then standardize.
# NOTE(review): selector and scaler are fitted on all rows, including the rows
# train_xgboost later holds out for testing.
selector = SelectKBest(score_func=chi2, k=min(20, text_features.shape[1]))
text_features = selector.fit_transform(text_features, y)
scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

# now add textual features to the categorical features
Xnew = np.concatenate((X, text_features), axis=1)
print(Xnew.shape)

train_xgboost(Xnew, y)

# Word2Vec

In [None]:
from gensim.models import Word2Vec
from nltk import word_tokenize

# One token list per description document, in the same order as `textdata`.
sentences = list(map(word_tokenize, textdata))

# 20-dimensional embeddings; min_count=1 keeps every token in the vocabulary.
# NOTE(review): `size` is the keyword used by gensim releases before 4.0; later
# releases call it `vector_size` -- confirm against the installed version.
model = Word2Vec(sentences, size=20, min_count=1)



In [None]:
def compute_features(sentence):
    """Return the mean word2vec vector of ``sentence``.

    Parameters
    ----------
    sentence : list of str
        Tokens of one document; every token is looked up in ``model.wv``.

    Returns
    -------
    numpy.ndarray
        1-D vector of the embedding size. An empty sentence yields all zeros
        (rather than the scalar 0) so every document has the same shape and
        the results can be stacked into a regular 2-D array.
    """
    if len(sentence) == 0:
        return np.zeros(model.wv.vector_size, dtype=np.float32)
    return np.mean([model.wv[word] for word in sentence], axis=0)


# One embedding row per description document: shape (n_documents, n_dims).
doc_vectors = np.vstack([compute_features(sentence) for sentence in sentences])

# `sentences` holds one block of len(y) documents per description column
# (a single block when only the primary diagnosis is used). Viewing the matrix
# as (n_blocks, n_patients, n_dims) and summing over the blocks adds up each
# patient's OWN descriptions; reshape(10000, -1) would instead combine three
# consecutive documents, i.e. three different patients.
n_patients = len(y)
text_features = doc_vectors.reshape(-1, n_patients, doc_vectors.shape[1]).sum(axis=0)

scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)

Xnew = np.concatenate((X, text_features), axis=1)

train_xgboost(Xnew, y)
