In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest

import numpy as np
import pandas as pd


def preprocess_data(data):
    data_original = data.copy()
    data = data.replace('?', np.NaN)

    # Drop unnecessary columns
    # weights, payer_code, diag_1_desc, diag_2_desc, diag_3_desc
    data.drop(labels=['weight', 'payer_code', 'diag_1_desc', 'diag_2_desc', 'diag_3_desc'], axis=1, inplace=True)

    data['diag_1'] = group_diagnoses(data['diag_1'])
    data['diag_2'] = group_diagnoses(data['diag_2'])
    data['diag_3'] = group_diagnoses(data['diag_3'])

    # Encode string data to numericals
    to_cat = list(data.select_dtypes(['object']).columns)
    data[to_cat] = data[to_cat].astype('category')
    cat_columns = data.select_dtypes(['category']).columns
    data[cat_columns] = data[cat_columns].apply(lambda x: x.cat.codes)

    # Get Readmitted as labels
    labels = data['readmitted']
    data.drop(labels=['readmitted'], axis=1, inplace=True)
    # data = data.replace(-1, np.NaN)
   
    #names = ['diag_1_desc', 'diag_2_desc', 'diag_3_desc']
#     for col in names:
#         features = extract_textual_features(data_original, col)
#         data = pd.concat([data, features], axis=1)

    return labels.values.ravel(), data.values


def extract_textual_features(data, colname):
    corpus = data[colname]
    corpus = corpus.replace(np.NaN, '').values
    vectorizer = TfidfVectorizer(stop_words='english', max_df=0.5, min_df=0.0001)
    X = vectorizer.fit_transform(corpus)
    text_features = pd.DataFrame.sparse.from_spmatrix(X)
    return text_features


def group_diagnoses(df):
    # Create mapping from
    l_old = []
    l_new = []

    idx = 0
    tmp_list1 = list(range(390, 460))
    tmp_list1 += [785]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1

    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = list(range(460, 520))
    tmp_list1 += [786]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1

    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = list(range(520, 579))
    tmp_list1 += [787]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1

    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = [str(i) for i in list(np.arange(250, 251, 0.01))]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = range(800, 1000)
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = range(710, 740)
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = list(range(580, 630))
    tmp_list1 += [788]
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    tmp_list1 = range(140, 240)
    tmp_list2 = [idx] * len(tmp_list1)
    idx += 1
    l_old = [*l_old, *tmp_list1]
    l_new = [*l_new, *tmp_list2]

    l_old = [str(i) for i in l_old]
    d = dict(zip(l_old, l_new))

    df_new = df.copy()

    df_new = df_new.map(d)
    df_new = df_new.replace(df_new[pd.isna(df_new)], 8)
    df_new = df_new.astype(int)
    return df_new



In [11]:
data_train = pd.read_csv('task1/data/diab_train.csv', index_col=0)
data_test = pd.read_csv('task1/data/diab_test.csv', index_col=0)
data_validation = pd.read_csv('task1/data/diab_validation.csv', index_col=0)

data_train = pd.concat([data_train, data_validation, data_test], axis=0)
data_train.reset_index(drop=True, inplace=True)
data = data_train.copy()
y, X = preprocess_data(data_train)



In [12]:
print(y.shape, X.shape)
data

(10000,) (10000, 45)


Unnamed: 0,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,medical_specialty,...,glipizide.metformin,glimepiride.pioglitazone,metformin.rosiglitazone,metformin.pioglitazone,change,diabetesMed,readmitted,diag_1_desc,diag_2_desc,diag_3_desc
0,AfricanAmerican,Male,[60-70),?,Emergency,Discharged to home,Emergency Room,4,MC,?,...,No,No,No,No,No,No,0,Acute pericarditis in diseases classified else...,Secondary malignant neoplasm of kidney,"Congestive heart failure, unspecified"
1,Caucasian,Female,[70-80),?,Elective,,Physician Referral,1,?,Family/GeneralPractice,...,No,No,No,No,Ch,Yes,1,Malignant essential hypertension,"Spinal stenosis, unspecified region",Diabetes mellitus without mention of complicat...
2,Caucasian,Female,[80-90),?,Urgent,Discharged/transferred to SNF,Emergency Room,2,MC,Emergency/Trauma,...,No,No,No,No,Ch,Yes,0,"Urinary tract infection, site not specified",Streptococcus infection in conditions classifi...,"Congestive heart failure, unspecified"
3,AfricanAmerican,Female,[50-60),?,Emergency,Discharged to home,Emergency Room,4,DM,?,...,No,No,No,No,Ch,Yes,1,"Respiratory abnormality, unspecified","Hypertensive chronic kidney disease, malignant...",Diabetes mellitus without mention of complicat...
4,Caucasian,Male,[80-90),?,Elective,,Physician Referral,13,?,?,...,No,No,No,No,Ch,Yes,1,Coronary atherosclerosis of unspecified type o...,"Chronic airway obstruction, not elsewhere clas...",Malignant essential hypertension
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,?,Male,[40-50),?,Urgent,Discharged to home,Transfer from a hospital,1,UN,Cardiology,...,No,No,No,No,No,Yes,0,Coronary atherosclerosis of unspecified type o...,Diabetes mellitus without mention of complicat...,Pure hypercholesterolemia
9996,Caucasian,Male,[80-90),?,Not Available,Discharged/transferred to SNF,,6,?,?,...,No,No,No,No,No,Yes,0,Atherosclerosis of aorta,Endomyocardial fibrosis,Diabetes mellitus without mention of complicat...
9997,AfricanAmerican,Male,[40-50),?,Elective,,Physician Referral,4,?,Urology,...,No,No,No,No,No,Yes,0,Malignant neoplasm of prostate,Hypertrophy (benign) of prostate without urina...,"Obesity, unspecified"
9998,AfricanAmerican,Male,[50-60),?,Emergency,Discharged to home,Emergency Room,2,?,InternalMedicine,...,No,No,No,No,No,Yes,0,Cellulitis and abscess of face,Diabetes mellitus without mention of complicat...,


In [17]:
#### we tried 2 approaches: 
#### 1) only use data from first diagnoses description
#### 2) use all 3 and take the mean.

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

def preprocess(text):
    stop_words = set(stopwords.words('english')) 
    word_tokens = word_tokenize(text) 
    text = [w for w in word_tokens if not w in stop_words]
    text = ' '.join(text)
    return WordNetLemmatizer().lemmatize(text)



OPTION1= False

if OPTION1:
    ### only take features from diag_desc_1
    diag1 = data['diag_1_desc']

    diag1 = diag1.replace(np.NaN, '').values
    vectorizer = CountVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess)
    text_features = vectorizer.fit_transform(diag1)
    text_features = pd.DataFrame.sparse.from_spmatrix(text_features)
    text_features
else:
    textdata = pd.concat([data['diag_1_desc'], data['diag_2_desc'], data['diag_3_desc']], axis=0)
    textdata.reset_index(inplace=True, drop=True)
    textdata

    from sklearn.feature_extraction.text import TfidfVectorizer
    textdata = textdata.replace(np.NaN, '').values
    vectorizer = CountVectorizer(stop_words='english', max_df=0.5, preprocessor=preprocess)
    text_features = vectorizer.fit_transform(textdata)
    text_features = pd.DataFrame.sparse.from_spmatrix(text_features).values

    print(text_features)

    text_features = text_features.reshape(10000,-1)
    print(text_features.shape)
    # text_features = text_features.toarray()
    dim = int(text_features.shape[1]/3)
    print(dim)
    diag1 = text_features[:,:dim]
    diag2 = text_features[:,dim:dim*2]
    diag3 = text_features[:, 2*dim:]
    
    print(diag1.shape, diag2.shape, diag3.shape)

    text_features = diag1 + diag2 + diag3 
    text_features.shape



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\berka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  'stop_words.' % sorted(inconsistent))


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(10000, 3552)
(10000, 1077) (10000, 1077) (10000, 1398)


ValueError: operands could not be broadcast together with shapes (10000,1077) (10000,1398) 

In [14]:
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler

pca = PCA(n_components = 10)
text_features = pca.fit_transform(text_features)

scaler = StandardScaler()
text_features = scaler.fit_transform(text_features)
text_features.shape

(10000, 10)

In [15]:
X.shape

(10000, 45)

In [7]:
# now add textual features to the categorical features
X = np.concatenate((X, text_features), axis=1)
X.shape

(10000, 55)

In [8]:
# now do xgboost
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import make_scorer, f1_score, roc_curve, roc_auc_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
import auxilary

X_train = X[:8000]
y_train = y[:8000]
y_test, X_test = y[8000:], X[8000:]

print('X_train shape: {}  and X_test shape: {}'.format(X_train.shape, X_test.shape))

scoring ={'auroc':    make_scorer(roc_auc_score, greater_is_better=True),
        'f1_score': make_scorer(f1_score, average='micro', greater_is_better=True)}

xgb_model = xgb.XGBClassifier(scale_pos_weight=2.5, disable_defeult_eval_metric=0)
adaboost = AdaBoostClassifier(random_state=42)

parameters_adaboost = {'n_estimators': [50, 100, 500, 1000]}


parameters = {
    'objective': ['binary:logistic'],
    'max_depth': [200],
    'min_child_weight': [9],
    'n_estimators': [500],
    'seed': [11],
    'learning_rate': [0.01],
    'max_delta_step': [0],
    'subsample': [0.75]
}

clf = GridSearchCV(estimator=xgb_model, param_grid=parameters, n_jobs=4, cv=4, scoring=scoring, verbose=3,
                   refit='f1_score')
clf.fit(X_train, y_train)

print('best score: ', clf.best_score_)
print('best parameters: ', clf.best_params_)

y_predict = clf.predict(X_test)
print("0 predictions: {}  1 predictions: {}".format(np.count_nonzero(y_predict == 0) ,np.count_nonzero(y_predict == 1)))

accuracy = accuracy_score(y_test, y_predict)
aucroc_score = roc_auc_score(y_test, y_predict)
f1Score = f1_score(y_test, y_predict)
print('Accuracy: {} AUCROC: {} F1: {}'.format(accuracy, aucroc_score, f1Score))

# best parameters:  {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 10, 'min_child_weight': 5, 'n_estimators': 500, 'objective': 'binary:logistic', 'seed': 11, 'subsample': 0.75}




X_train shape: (8000, 55)  and X_test shape: (2000, 55)
Fitting 4 folds for each of 1 candidates, totalling 4 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   4 | elapsed:   45.4s finished


best score:  0.62025
best parameters:  {'learning_rate': 0.01, 'max_delta_step': 0, 'max_depth': 200, 'min_child_weight': 9, 'n_estimators': 500, 'objective': 'binary:logistic', 'seed': 11, 'subsample': 0.75}
0 predictions: 894  1 predictions: 1106
Accuracy: 0.6245 AUCROC: 0.6415356615622823 F1: 0.6045286993154292
